Text Generation
Transformers
PyTorch
Safetensors
Swedish
ctrl
Inference Endpoints
SweCTRL-Mini / trainer_state.json
dkalpakchi's picture
Uploaded model, tokenizer and the minimally necessary code
59392d8
raw
history blame
34.4 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 28.5,
"global_step": 2786844,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.1,
"learning_rate": 0.020000000000000004,
"loss": 5.3687,
"step": 10000
},
{
"epoch": 0.2,
"learning_rate": 0.04000000000000001,
"loss": 4.0531,
"step": 20000
},
{
"epoch": 0.31,
"learning_rate": 0.04993567245443037,
"loss": 3.8149,
"step": 30000
},
{
"epoch": 0.41,
"learning_rate": 0.0498070173632911,
"loss": 3.6549,
"step": 40000
},
{
"epoch": 0.51,
"learning_rate": 0.04967836227215183,
"loss": 3.5544,
"step": 50000
},
{
"epoch": 0.61,
"learning_rate": 0.049549707181012564,
"loss": 3.4935,
"step": 60000
},
{
"epoch": 0.72,
"learning_rate": 0.0494210520898733,
"loss": 3.447,
"step": 70000
},
{
"epoch": 0.82,
"learning_rate": 0.049292396998734035,
"loss": 3.4134,
"step": 80000
},
{
"epoch": 0.92,
"learning_rate": 0.04916374190759477,
"loss": 3.3861,
"step": 90000
},
{
"epoch": 1.02,
"learning_rate": 0.049035086816455506,
"loss": 3.3523,
"step": 100000
},
{
"epoch": 1.12,
"learning_rate": 0.04890643172531624,
"loss": 3.3063,
"step": 110000
},
{
"epoch": 1.23,
"learning_rate": 0.04877777663417697,
"loss": 3.2983,
"step": 120000
},
{
"epoch": 1.33,
"learning_rate": 0.0486491215430377,
"loss": 3.2874,
"step": 130000
},
{
"epoch": 1.43,
"learning_rate": 0.04852046645189844,
"loss": 3.2785,
"step": 140000
},
{
"epoch": 1.53,
"learning_rate": 0.04839181136075917,
"loss": 3.2698,
"step": 150000
},
{
"epoch": 1.64,
"learning_rate": 0.048263156269619904,
"loss": 3.2599,
"step": 160000
},
{
"epoch": 1.74,
"learning_rate": 0.04813450117848064,
"loss": 3.2503,
"step": 170000
},
{
"epoch": 1.84,
"learning_rate": 0.048005846087341375,
"loss": 3.2432,
"step": 180000
},
{
"epoch": 1.94,
"learning_rate": 0.04787719099620211,
"loss": 3.2347,
"step": 190000
},
{
"epoch": 2.05,
"learning_rate": 0.04774853590506284,
"loss": 3.1812,
"step": 200000
},
{
"epoch": 2.15,
"learning_rate": 0.04761988081392357,
"loss": 3.1865,
"step": 210000
},
{
"epoch": 2.25,
"learning_rate": 0.0474912257227843,
"loss": 3.1873,
"step": 220000
},
{
"epoch": 2.35,
"learning_rate": 0.047362570631645035,
"loss": 3.1842,
"step": 230000
},
{
"epoch": 2.45,
"learning_rate": 0.047233915540505766,
"loss": 3.1824,
"step": 240000
},
{
"epoch": 2.56,
"learning_rate": 0.047105260449366505,
"loss": 3.1806,
"step": 250000
},
{
"epoch": 2.66,
"learning_rate": 0.04697660535822724,
"loss": 3.177,
"step": 260000
},
{
"epoch": 2.76,
"learning_rate": 0.04684795026708797,
"loss": 3.1741,
"step": 270000
},
{
"epoch": 2.86,
"learning_rate": 0.04671929517594871,
"loss": 3.1709,
"step": 280000
},
{
"epoch": 2.97,
"learning_rate": 0.04659064008480944,
"loss": 3.1682,
"step": 290000
},
{
"epoch": 3.07,
"learning_rate": 0.04646198499367017,
"loss": 3.1382,
"step": 300000
},
{
"epoch": 3.17,
"learning_rate": 0.046333329902530904,
"loss": 3.1283,
"step": 310000
},
{
"epoch": 3.27,
"learning_rate": 0.04620467481139164,
"loss": 3.1318,
"step": 320000
},
{
"epoch": 3.37,
"learning_rate": 0.046076019720252374,
"loss": 3.1319,
"step": 330000
},
{
"epoch": 3.48,
"learning_rate": 0.045947364629113106,
"loss": 3.1335,
"step": 340000
},
{
"epoch": 3.58,
"learning_rate": 0.04581870953797384,
"loss": 3.1326,
"step": 350000
},
{
"epoch": 3.68,
"learning_rate": 0.04569005444683458,
"loss": 3.1306,
"step": 360000
},
{
"epoch": 3.78,
"learning_rate": 0.04556139935569531,
"loss": 3.1289,
"step": 370000
},
{
"epoch": 3.89,
"learning_rate": 0.04543274426455604,
"loss": 3.1275,
"step": 380000
},
{
"epoch": 3.99,
"learning_rate": 0.04530408917341677,
"loss": 3.1259,
"step": 390000
},
{
"epoch": 4.09,
"learning_rate": 0.045175434082277505,
"loss": 3.0882,
"step": 400000
},
{
"epoch": 4.19,
"learning_rate": 0.04504677899113824,
"loss": 3.094,
"step": 410000
},
{
"epoch": 4.3,
"learning_rate": 0.04491812389999897,
"loss": 3.0958,
"step": 420000
},
{
"epoch": 4.4,
"learning_rate": 0.04478946880885971,
"loss": 3.0976,
"step": 430000
},
{
"epoch": 4.5,
"learning_rate": 0.04466081371772044,
"loss": 3.0996,
"step": 440000
},
{
"epoch": 4.6,
"learning_rate": 0.04453215862658117,
"loss": 3.099,
"step": 450000
},
{
"epoch": 4.7,
"learning_rate": 0.04440350353544191,
"loss": 3.1,
"step": 460000
},
{
"epoch": 4.81,
"learning_rate": 0.04427484844430264,
"loss": 3.0993,
"step": 470000
},
{
"epoch": 4.91,
"learning_rate": 0.044146193353163374,
"loss": 3.0985,
"step": 480000
},
{
"epoch": 5.01,
"learning_rate": 0.044017538262024106,
"loss": 3.094,
"step": 490000
},
{
"epoch": 5.11,
"learning_rate": 0.043888883170884845,
"loss": 3.0646,
"step": 500000
},
{
"epoch": 5.22,
"learning_rate": 0.043760228079745576,
"loss": 3.0694,
"step": 510000
},
{
"epoch": 5.32,
"learning_rate": 0.04363157298860631,
"loss": 3.0712,
"step": 520000
},
{
"epoch": 5.42,
"learning_rate": 0.04350291789746704,
"loss": 3.0736,
"step": 530000
},
{
"epoch": 5.52,
"learning_rate": 0.04337426280632778,
"loss": 3.0752,
"step": 540000
},
{
"epoch": 5.62,
"learning_rate": 0.04324560771518851,
"loss": 3.0759,
"step": 550000
},
{
"epoch": 5.73,
"learning_rate": 0.04311695262404924,
"loss": 3.0762,
"step": 560000
},
{
"epoch": 5.83,
"learning_rate": 0.042988297532909975,
"loss": 3.0756,
"step": 570000
},
{
"epoch": 5.93,
"learning_rate": 0.04285964244177071,
"loss": 3.076,
"step": 580000
},
{
"epoch": 6.03,
"learning_rate": 0.04273098735063144,
"loss": 3.0412,
"step": 590000
},
{
"epoch": 6.14,
"learning_rate": 0.04260233225949217,
"loss": 3.0464,
"step": 600000
},
{
"epoch": 6.24,
"learning_rate": 0.04247367716835291,
"loss": 3.05,
"step": 610000
},
{
"epoch": 6.34,
"learning_rate": 0.04234502207721364,
"loss": 3.0539,
"step": 620000
},
{
"epoch": 6.44,
"learning_rate": 0.04221636698607437,
"loss": 3.0554,
"step": 630000
},
{
"epoch": 6.55,
"learning_rate": 0.042087711894935105,
"loss": 3.0558,
"step": 640000
},
{
"epoch": 6.65,
"learning_rate": 0.041959056803795844,
"loss": 3.057,
"step": 650000
},
{
"epoch": 6.75,
"learning_rate": 0.041830401712656576,
"loss": 3.0584,
"step": 660000
},
{
"epoch": 6.85,
"learning_rate": 0.04170174662151731,
"loss": 3.0585,
"step": 670000
},
{
"epoch": 6.95,
"learning_rate": 0.04157309153037805,
"loss": 3.0593,
"step": 680000
},
{
"epoch": 7.06,
"learning_rate": 0.04144443643923878,
"loss": 3.0408,
"step": 690000
},
{
"epoch": 7.16,
"learning_rate": 0.04131578134809951,
"loss": 3.0325,
"step": 700000
},
{
"epoch": 7.26,
"learning_rate": 0.04118712625696024,
"loss": 3.035,
"step": 710000
},
{
"epoch": 7.36,
"learning_rate": 0.04105847116582098,
"loss": 3.0373,
"step": 720000
},
{
"epoch": 7.47,
"learning_rate": 0.04092981607468171,
"loss": 3.0405,
"step": 730000
},
{
"epoch": 7.57,
"learning_rate": 0.040801160983542445,
"loss": 3.0403,
"step": 740000
},
{
"epoch": 7.67,
"learning_rate": 0.04067250589240318,
"loss": 3.0431,
"step": 750000
},
{
"epoch": 7.77,
"learning_rate": 0.04054385080126391,
"loss": 3.0444,
"step": 760000
},
{
"epoch": 7.87,
"learning_rate": 0.04041519571012464,
"loss": 3.0445,
"step": 770000
},
{
"epoch": 7.98,
"learning_rate": 0.04028654061898537,
"loss": 3.0452,
"step": 780000
},
{
"epoch": 8.08,
"learning_rate": 0.04015788552784611,
"loss": 3.0217,
"step": 790000
},
{
"epoch": 8.18,
"learning_rate": 0.04002923043670684,
"loss": 3.02,
"step": 800000
},
{
"epoch": 8.28,
"learning_rate": 0.039900575345567575,
"loss": 3.0233,
"step": 810000
},
{
"epoch": 8.39,
"learning_rate": 0.03977192025442831,
"loss": 3.0259,
"step": 820000
},
{
"epoch": 8.49,
"learning_rate": 0.039643265163289046,
"loss": 3.0271,
"step": 830000
},
{
"epoch": 8.59,
"learning_rate": 0.03951461007214978,
"loss": 3.0121,
"step": 840000
},
{
"epoch": 8.69,
"learning_rate": 0.03938595498101051,
"loss": 3.0161,
"step": 850000
},
{
"epoch": 8.79,
"learning_rate": 0.03925729988987125,
"loss": 3.0195,
"step": 860000
},
{
"epoch": 8.9,
"learning_rate": 0.03912864479873198,
"loss": 3.021,
"step": 870000
},
{
"epoch": 9.0,
"learning_rate": 0.03899998970759271,
"loss": 3.0228,
"step": 880000
},
{
"epoch": 9.1,
"learning_rate": 0.038871334616453444,
"loss": 3.0073,
"step": 890000
},
{
"epoch": 9.2,
"learning_rate": 0.03874267952531418,
"loss": 3.0114,
"step": 900000
},
{
"epoch": 9.31,
"learning_rate": 0.038614024434174915,
"loss": 3.0151,
"step": 910000
},
{
"epoch": 9.41,
"learning_rate": 0.03848536934303565,
"loss": 3.0175,
"step": 920000
},
{
"epoch": 9.51,
"learning_rate": 0.03835671425189638,
"loss": 3.0193,
"step": 930000
},
{
"epoch": 9.61,
"learning_rate": 0.03822805916075711,
"loss": 3.0185,
"step": 940000
},
{
"epoch": 9.72,
"learning_rate": 0.03809940406961784,
"loss": 3.0228,
"step": 950000
},
{
"epoch": 9.82,
"learning_rate": 0.037970748978478575,
"loss": 3.0226,
"step": 960000
},
{
"epoch": 9.92,
"learning_rate": 0.03784209388733931,
"loss": 3.0227,
"step": 970000
},
{
"epoch": 10.02,
"learning_rate": 0.037713438796200045,
"loss": 3.017,
"step": 980000
},
{
"epoch": 10.12,
"learning_rate": 0.03758478370506078,
"loss": 2.9992,
"step": 990000
},
{
"epoch": 10.23,
"learning_rate": 0.03745612861392151,
"loss": 3.0007,
"step": 1000000
},
{
"epoch": 10.33,
"learning_rate": 0.03732747352278225,
"loss": 3.0047,
"step": 1010000
},
{
"epoch": 10.43,
"learning_rate": 0.03719881843164298,
"loss": 3.0075,
"step": 1020000
},
{
"epoch": 10.53,
"learning_rate": 0.03707016334050371,
"loss": 3.0095,
"step": 1030000
},
{
"epoch": 10.64,
"learning_rate": 0.03694150824936445,
"loss": 3.0101,
"step": 1040000
},
{
"epoch": 10.74,
"learning_rate": 0.03681285315822518,
"loss": 3.0123,
"step": 1050000
},
{
"epoch": 10.84,
"learning_rate": 0.036684198067085914,
"loss": 3.013,
"step": 1060000
},
{
"epoch": 10.94,
"learning_rate": 0.036555542975946646,
"loss": 3.0121,
"step": 1070000
},
{
"epoch": 11.04,
"learning_rate": 0.03642688788480738,
"loss": 2.9892,
"step": 1080000
},
{
"epoch": 11.15,
"learning_rate": 0.03629823279366811,
"loss": 2.9908,
"step": 1090000
},
{
"epoch": 11.25,
"learning_rate": 0.03616957770252884,
"loss": 2.9941,
"step": 1100000
},
{
"epoch": 11.35,
"learning_rate": 0.036040922611389574,
"loss": 2.9976,
"step": 1110000
},
{
"epoch": 11.45,
"learning_rate": 0.03591226752025031,
"loss": 2.9992,
"step": 1120000
},
{
"epoch": 11.56,
"learning_rate": 0.035783612429111045,
"loss": 3.0007,
"step": 1130000
},
{
"epoch": 11.66,
"learning_rate": 0.03565495733797178,
"loss": 3.0021,
"step": 1140000
},
{
"epoch": 11.76,
"learning_rate": 0.035526302246832515,
"loss": 3.0032,
"step": 1150000
},
{
"epoch": 11.86,
"learning_rate": 0.03539764715569325,
"loss": 3.0057,
"step": 1160000
},
{
"epoch": 11.97,
"learning_rate": 0.03526899206455398,
"loss": 3.0053,
"step": 1170000
},
{
"epoch": 12.07,
"learning_rate": 0.03514033697341471,
"loss": 2.9898,
"step": 1180000
},
{
"epoch": 12.17,
"learning_rate": 0.03501168188227545,
"loss": 2.9848,
"step": 1190000
},
{
"epoch": 12.27,
"learning_rate": 0.03488302679113618,
"loss": 2.9871,
"step": 1200000
},
{
"epoch": 12.37,
"learning_rate": 0.034754371699996914,
"loss": 2.9903,
"step": 1210000
},
{
"epoch": 12.48,
"learning_rate": 0.03462571660885765,
"loss": 2.9918,
"step": 1220000
},
{
"epoch": 12.58,
"learning_rate": 0.034497061517718385,
"loss": 2.9948,
"step": 1230000
},
{
"epoch": 12.68,
"learning_rate": 0.034368406426579116,
"loss": 2.9955,
"step": 1240000
},
{
"epoch": 12.78,
"learning_rate": 0.03423975133543985,
"loss": 2.9971,
"step": 1250000
},
{
"epoch": 12.89,
"learning_rate": 0.03411109624430058,
"loss": 2.9978,
"step": 1260000
},
{
"epoch": 12.99,
"learning_rate": 0.03398244115316131,
"loss": 2.9985,
"step": 1270000
},
{
"epoch": 13.09,
"learning_rate": 0.033853786062022044,
"loss": 2.9789,
"step": 1280000
},
{
"epoch": 13.19,
"learning_rate": 0.033725130970882776,
"loss": 2.9795,
"step": 1290000
},
{
"epoch": 13.29,
"learning_rate": 0.033596475879743515,
"loss": 2.9835,
"step": 1300000
},
{
"epoch": 13.4,
"learning_rate": 0.03346782078860425,
"loss": 2.9829,
"step": 1310000
},
{
"epoch": 13.5,
"learning_rate": 0.03333916569746498,
"loss": 2.9869,
"step": 1320000
},
{
"epoch": 13.6,
"learning_rate": 0.03321051060632572,
"loss": 2.9755,
"step": 1330000
},
{
"epoch": 13.7,
"learning_rate": 0.03308185551518645,
"loss": 2.978,
"step": 1340000
},
{
"epoch": 13.81,
"learning_rate": 0.03295320042404718,
"loss": 2.9811,
"step": 1350000
},
{
"epoch": 13.91,
"learning_rate": 0.03282454533290791,
"loss": 2.9824,
"step": 1360000
},
{
"epoch": 14.01,
"learning_rate": 0.03269589024176865,
"loss": 2.9832,
"step": 1370000
},
{
"epoch": 14.11,
"learning_rate": 0.032567235150629384,
"loss": 2.9726,
"step": 1380000
},
{
"epoch": 14.22,
"learning_rate": 0.032438580059490116,
"loss": 2.9762,
"step": 1390000
},
{
"epoch": 14.32,
"learning_rate": 0.03230992496835085,
"loss": 2.9786,
"step": 1400000
},
{
"epoch": 14.42,
"learning_rate": 0.03218126987721159,
"loss": 2.9804,
"step": 1410000
},
{
"epoch": 14.52,
"learning_rate": 0.03205261478607232,
"loss": 2.9821,
"step": 1420000
},
{
"epoch": 14.62,
"learning_rate": 0.03192395969493305,
"loss": 2.9825,
"step": 1430000
},
{
"epoch": 14.73,
"learning_rate": 0.03179530460379378,
"loss": 2.985,
"step": 1440000
},
{
"epoch": 14.83,
"learning_rate": 0.031666649512654514,
"loss": 2.9851,
"step": 1450000
},
{
"epoch": 14.93,
"learning_rate": 0.031537994421515246,
"loss": 2.9859,
"step": 1460000
},
{
"epoch": 15.03,
"learning_rate": 0.03140933933037598,
"loss": 2.9795,
"step": 1470000
},
{
"epoch": 15.14,
"learning_rate": 0.03128068423923672,
"loss": 2.9681,
"step": 1480000
},
{
"epoch": 15.24,
"learning_rate": 0.03115202914809745,
"loss": 2.9707,
"step": 1490000
},
{
"epoch": 15.34,
"learning_rate": 0.03102337405695818,
"loss": 2.9727,
"step": 1500000
},
{
"epoch": 15.44,
"learning_rate": 0.03089471896581892,
"loss": 2.9747,
"step": 1510000
},
{
"epoch": 15.54,
"learning_rate": 0.03076606387467965,
"loss": 2.9769,
"step": 1520000
},
{
"epoch": 15.65,
"learning_rate": 0.030637408783540383,
"loss": 2.9778,
"step": 1530000
},
{
"epoch": 15.75,
"learning_rate": 0.030508753692401115,
"loss": 2.9788,
"step": 1540000
},
{
"epoch": 15.85,
"learning_rate": 0.030380098601261854,
"loss": 2.9789,
"step": 1550000
},
{
"epoch": 15.95,
"learning_rate": 0.030251443510122586,
"loss": 2.9807,
"step": 1560000
},
{
"epoch": 16.06,
"learning_rate": 0.030122788418983318,
"loss": 2.9619,
"step": 1570000
},
{
"epoch": 16.16,
"learning_rate": 0.02999413332784405,
"loss": 2.9638,
"step": 1580000
},
{
"epoch": 16.26,
"learning_rate": 0.029865478236704785,
"loss": 2.9654,
"step": 1590000
},
{
"epoch": 16.36,
"learning_rate": 0.029736823145565517,
"loss": 2.9679,
"step": 1600000
},
{
"epoch": 16.46,
"learning_rate": 0.02960816805442625,
"loss": 2.9704,
"step": 1610000
},
{
"epoch": 16.57,
"learning_rate": 0.029479512963286988,
"loss": 2.9726,
"step": 1620000
},
{
"epoch": 16.67,
"learning_rate": 0.02935085787214772,
"loss": 2.9725,
"step": 1630000
},
{
"epoch": 16.77,
"learning_rate": 0.02922220278100845,
"loss": 2.9738,
"step": 1640000
},
{
"epoch": 16.87,
"learning_rate": 0.029093547689869183,
"loss": 2.9747,
"step": 1650000
},
{
"epoch": 16.98,
"learning_rate": 0.02896489259872992,
"loss": 2.9763,
"step": 1660000
},
{
"epoch": 17.08,
"learning_rate": 0.02883623750759065,
"loss": 2.9617,
"step": 1670000
},
{
"epoch": 17.18,
"learning_rate": 0.028707582416451383,
"loss": 2.9618,
"step": 1680000
},
{
"epoch": 17.28,
"learning_rate": 0.028578927325312115,
"loss": 2.9612,
"step": 1690000
},
{
"epoch": 17.39,
"learning_rate": 0.028450272234172853,
"loss": 2.9632,
"step": 1700000
},
{
"epoch": 17.49,
"learning_rate": 0.028321617143033585,
"loss": 2.9655,
"step": 1710000
},
{
"epoch": 17.59,
"learning_rate": 0.028192962051894317,
"loss": 2.9672,
"step": 1720000
},
{
"epoch": 17.69,
"learning_rate": 0.028064306960755056,
"loss": 2.9691,
"step": 1730000
},
{
"epoch": 17.79,
"learning_rate": 0.027935651869615788,
"loss": 2.9698,
"step": 1740000
},
{
"epoch": 17.9,
"learning_rate": 0.02780699677847652,
"loss": 2.9702,
"step": 1750000
},
{
"epoch": 18.0,
"learning_rate": 0.027678341687337252,
"loss": 2.9719,
"step": 1760000
},
{
"epoch": 18.1,
"learning_rate": 0.027549686596197987,
"loss": 2.9546,
"step": 1770000
},
{
"epoch": 18.2,
"learning_rate": 0.02742103150505872,
"loss": 2.9567,
"step": 1780000
},
{
"epoch": 18.31,
"learning_rate": 0.02729237641391945,
"loss": 2.9586,
"step": 1790000
},
{
"epoch": 18.41,
"learning_rate": 0.02716372132278019,
"loss": 2.9606,
"step": 1800000
},
{
"epoch": 18.51,
"learning_rate": 0.02703506623164092,
"loss": 2.9506,
"step": 1810000
},
{
"epoch": 18.61,
"learning_rate": 0.026906411140501654,
"loss": 2.9518,
"step": 1820000
},
{
"epoch": 18.71,
"learning_rate": 0.026777756049362385,
"loss": 2.9575,
"step": 1830000
},
{
"epoch": 18.82,
"learning_rate": 0.02664910095822312,
"loss": 2.9584,
"step": 1840000
},
{
"epoch": 18.92,
"learning_rate": 0.026520445867083853,
"loss": 2.9594,
"step": 1850000
},
{
"epoch": 19.02,
"learning_rate": 0.026391790775944585,
"loss": 2.9578,
"step": 1860000
},
{
"epoch": 19.12,
"learning_rate": 0.026263135684805317,
"loss": 2.9535,
"step": 1870000
},
{
"epoch": 19.23,
"learning_rate": 0.026134480593666055,
"loss": 2.9552,
"step": 1880000
},
{
"epoch": 19.33,
"learning_rate": 0.026005825502526787,
"loss": 2.9568,
"step": 1890000
},
{
"epoch": 19.43,
"learning_rate": 0.02587717041138752,
"loss": 2.9583,
"step": 1900000
},
{
"epoch": 19.53,
"learning_rate": 0.025748515320248258,
"loss": 2.9596,
"step": 1910000
},
{
"epoch": 19.64,
"learning_rate": 0.02561986022910899,
"loss": 2.9599,
"step": 1920000
},
{
"epoch": 19.74,
"learning_rate": 0.025491205137969722,
"loss": 2.9615,
"step": 1930000
},
{
"epoch": 19.84,
"learning_rate": 0.025362550046830454,
"loss": 2.9622,
"step": 1940000
},
{
"epoch": 19.94,
"learning_rate": 0.02523389495569119,
"loss": 2.9637,
"step": 1950000
},
{
"epoch": 20.04,
"learning_rate": 0.02510523986455192,
"loss": 2.9559,
"step": 1960000
},
{
"epoch": 20.15,
"learning_rate": 0.024976584773412653,
"loss": 2.9496,
"step": 1970000
},
{
"epoch": 20.25,
"learning_rate": 0.02484792968227339,
"loss": 2.9516,
"step": 1980000
},
{
"epoch": 20.35,
"learning_rate": 0.024719274591134124,
"loss": 2.9522,
"step": 1990000
},
{
"epoch": 20.45,
"learning_rate": 0.024590619499994856,
"loss": 2.9538,
"step": 2000000
},
{
"epoch": 20.56,
"learning_rate": 0.02446196440885559,
"loss": 2.9552,
"step": 2010000
},
{
"epoch": 20.66,
"learning_rate": 0.024333309317716323,
"loss": 2.957,
"step": 2020000
},
{
"epoch": 20.76,
"learning_rate": 0.024204654226577055,
"loss": 2.9572,
"step": 2030000
},
{
"epoch": 20.86,
"learning_rate": 0.024075999135437787,
"loss": 2.9586,
"step": 2040000
},
{
"epoch": 20.96,
"learning_rate": 0.023947344044298522,
"loss": 2.9609,
"step": 2050000
},
{
"epoch": 21.07,
"learning_rate": 0.023818688953159254,
"loss": 2.9429,
"step": 2060000
},
{
"epoch": 21.17,
"learning_rate": 0.02369003386201999,
"loss": 2.947,
"step": 2070000
},
{
"epoch": 21.27,
"learning_rate": 0.023561378770880725,
"loss": 2.9488,
"step": 2080000
},
{
"epoch": 21.37,
"learning_rate": 0.023432723679741457,
"loss": 2.9491,
"step": 2090000
},
{
"epoch": 21.48,
"learning_rate": 0.023304068588602192,
"loss": 2.9514,
"step": 2100000
},
{
"epoch": 21.58,
"learning_rate": 0.023175413497462924,
"loss": 2.9513,
"step": 2110000
},
{
"epoch": 21.68,
"learning_rate": 0.023046758406323656,
"loss": 2.9537,
"step": 2120000
},
{
"epoch": 21.78,
"learning_rate": 0.022918103315184388,
"loss": 2.9548,
"step": 2130000
},
{
"epoch": 21.88,
"learning_rate": 0.022789448224045123,
"loss": 2.9573,
"step": 2140000
},
{
"epoch": 21.99,
"learning_rate": 0.022660793132905855,
"loss": 2.9565,
"step": 2150000
},
{
"epoch": 22.09,
"learning_rate": 0.02253213804176659,
"loss": 2.9432,
"step": 2160000
},
{
"epoch": 22.19,
"learning_rate": 0.022403482950627322,
"loss": 2.9445,
"step": 2170000
},
{
"epoch": 22.29,
"learning_rate": 0.022274827859488058,
"loss": 2.9465,
"step": 2180000
},
{
"epoch": 22.4,
"learning_rate": 0.022146172768348793,
"loss": 2.9478,
"step": 2190000
},
{
"epoch": 22.5,
"learning_rate": 0.022017517677209525,
"loss": 2.9474,
"step": 2200000
},
{
"epoch": 22.6,
"learning_rate": 0.021888862586070257,
"loss": 2.9497,
"step": 2210000
},
{
"epoch": 22.7,
"learning_rate": 0.02176020749493099,
"loss": 2.9507,
"step": 2220000
},
{
"epoch": 22.81,
"learning_rate": 0.021631552403791724,
"loss": 2.9518,
"step": 2230000
},
{
"epoch": 22.91,
"learning_rate": 0.021502897312652456,
"loss": 2.9523,
"step": 2240000
},
{
"epoch": 23.01,
"learning_rate": 0.02137424222151319,
"loss": 2.951,
"step": 2250000
},
{
"epoch": 23.11,
"learning_rate": 0.021245587130373923,
"loss": 2.9394,
"step": 2260000
},
{
"epoch": 23.21,
"learning_rate": 0.02111693203923466,
"loss": 2.9426,
"step": 2270000
},
{
"epoch": 23.32,
"learning_rate": 0.020988276948095394,
"loss": 2.9425,
"step": 2280000
},
{
"epoch": 23.42,
"learning_rate": 0.020859621856956126,
"loss": 2.9449,
"step": 2290000
},
{
"epoch": 23.52,
"learning_rate": 0.020730966765816858,
"loss": 2.939,
"step": 2300000
},
{
"epoch": 23.62,
"learning_rate": 0.02060231167467759,
"loss": 2.939,
"step": 2310000
},
{
"epoch": 23.73,
"learning_rate": 0.020473656583538325,
"loss": 2.9414,
"step": 2320000
},
{
"epoch": 23.83,
"learning_rate": 0.020345001492399057,
"loss": 2.9433,
"step": 2330000
},
{
"epoch": 23.93,
"learning_rate": 0.020216346401259792,
"loss": 2.9436,
"step": 2340000
},
{
"epoch": 24.03,
"learning_rate": 0.020087691310120524,
"loss": 2.9421,
"step": 2350000
},
{
"epoch": 24.13,
"learning_rate": 0.01995903621898126,
"loss": 2.9385,
"step": 2360000
},
{
"epoch": 24.24,
"learning_rate": 0.019830381127841995,
"loss": 2.9413,
"step": 2370000
},
{
"epoch": 24.34,
"learning_rate": 0.019701726036702727,
"loss": 2.9426,
"step": 2380000
},
{
"epoch": 24.44,
"learning_rate": 0.01957307094556346,
"loss": 2.9423,
"step": 2390000
},
{
"epoch": 24.54,
"learning_rate": 0.01944441585442419,
"loss": 2.9442,
"step": 2400000
},
{
"epoch": 24.65,
"learning_rate": 0.019315760763284926,
"loss": 2.9457,
"step": 2410000
},
{
"epoch": 24.75,
"learning_rate": 0.019187105672145658,
"loss": 2.9455,
"step": 2420000
},
{
"epoch": 24.85,
"learning_rate": 0.019058450581006393,
"loss": 2.947,
"step": 2430000
},
{
"epoch": 24.95,
"learning_rate": 0.018929795489867125,
"loss": 2.9464,
"step": 2440000
},
{
"epoch": 25.06,
"learning_rate": 0.01880114039872786,
"loss": 2.9411,
"step": 2450000
},
{
"epoch": 25.16,
"learning_rate": 0.018672485307588593,
"loss": 2.9362,
"step": 2460000
},
{
"epoch": 25.26,
"learning_rate": 0.018543830216449324,
"loss": 2.9373,
"step": 2470000
},
{
"epoch": 25.36,
"learning_rate": 0.01841517512531006,
"loss": 2.9395,
"step": 2480000
},
{
"epoch": 25.46,
"learning_rate": 0.018286520034170792,
"loss": 2.9404,
"step": 2490000
},
{
"epoch": 25.57,
"learning_rate": 0.018157864943031527,
"loss": 2.9412,
"step": 2500000
},
{
"epoch": 25.67,
"learning_rate": 0.01802920985189226,
"loss": 2.9425,
"step": 2510000
},
{
"epoch": 25.77,
"learning_rate": 0.017900554760752994,
"loss": 2.943,
"step": 2520000
},
{
"epoch": 25.87,
"learning_rate": 0.017771899669613726,
"loss": 2.9446,
"step": 2530000
},
{
"epoch": 25.98,
"learning_rate": 0.01764324457847446,
"loss": 2.945,
"step": 2540000
},
{
"epoch": 26.08,
"learning_rate": 0.017514589487335194,
"loss": 2.9327,
"step": 2550000
},
{
"epoch": 26.18,
"learning_rate": 0.017385934396195925,
"loss": 2.9341,
"step": 2560000
},
{
"epoch": 26.28,
"learning_rate": 0.01725727930505666,
"loss": 2.9366,
"step": 2570000
},
{
"epoch": 26.38,
"learning_rate": 0.017128624213917393,
"loss": 2.9376,
"step": 2580000
},
{
"epoch": 26.49,
"learning_rate": 0.016999969122778128,
"loss": 2.9373,
"step": 2590000
},
{
"epoch": 26.59,
"learning_rate": 0.01687131403163886,
"loss": 2.9388,
"step": 2600000
},
{
"epoch": 26.69,
"learning_rate": 0.016742658940499595,
"loss": 2.9404,
"step": 2610000
},
{
"epoch": 26.79,
"learning_rate": 0.016614003849360327,
"loss": 2.9411,
"step": 2620000
},
{
"epoch": 26.9,
"learning_rate": 0.016485348758221063,
"loss": 2.943,
"step": 2630000
},
{
"epoch": 27.0,
"learning_rate": 0.016356693667081795,
"loss": 2.9421,
"step": 2640000
},
{
"epoch": 27.1,
"learning_rate": 0.016228038575942526,
"loss": 2.9313,
"step": 2650000
},
{
"epoch": 27.2,
"learning_rate": 0.016099383484803262,
"loss": 2.9337,
"step": 2660000
},
{
"epoch": 27.31,
"learning_rate": 0.015970728393663994,
"loss": 2.9341,
"step": 2670000
},
{
"epoch": 27.41,
"learning_rate": 0.01584207330252473,
"loss": 2.9353,
"step": 2680000
},
{
"epoch": 27.51,
"learning_rate": 0.01571341821138546,
"loss": 2.9359,
"step": 2690000
},
{
"epoch": 27.61,
"learning_rate": 0.015584763120246196,
"loss": 2.9363,
"step": 2700000
},
{
"epoch": 27.71,
"learning_rate": 0.015456108029106928,
"loss": 2.9387,
"step": 2710000
},
{
"epoch": 27.82,
"learning_rate": 0.015327452937967662,
"loss": 2.9388,
"step": 2720000
},
{
"epoch": 27.92,
"learning_rate": 0.015198797846828394,
"loss": 2.9399,
"step": 2730000
},
{
"epoch": 28.02,
"learning_rate": 0.01507014275568913,
"loss": 2.9384,
"step": 2740000
},
{
"epoch": 28.12,
"learning_rate": 0.014941487664549863,
"loss": 2.9305,
"step": 2750000
},
{
"epoch": 28.23,
"learning_rate": 0.014812832573410595,
"loss": 2.9325,
"step": 2760000
},
{
"epoch": 28.33,
"learning_rate": 0.01468417748227133,
"loss": 2.9332,
"step": 2770000
},
{
"epoch": 28.43,
"learning_rate": 0.014555522391132062,
"loss": 2.9339,
"step": 2780000
}
],
"max_steps": 3911360,
"num_train_epochs": 40,
"total_flos": 1.8137583487197538e+20,
"trial_name": null,
"trial_params": null
}