english_spelling_correction / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9998235190305312,
"eval_steps": 500,
"global_step": 4249,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0023530795929172306,
"grad_norm": 4.500667572021484,
"learning_rate": 0.0002992939515180042,
"loss": 2.9988,
"step": 10
},
{
"epoch": 0.004706159185834461,
"grad_norm": 2.7827706336975098,
"learning_rate": 0.00029858790303600844,
"loss": 1.0643,
"step": 20
},
{
"epoch": 0.007059238778751691,
"grad_norm": 2.746577739715576,
"learning_rate": 0.0002978818545540127,
"loss": 0.7978,
"step": 30
},
{
"epoch": 0.009412318371668922,
"grad_norm": 2.567692279815674,
"learning_rate": 0.0002971758060720169,
"loss": 0.6546,
"step": 40
},
{
"epoch": 0.011765397964586153,
"grad_norm": 2.2394003868103027,
"learning_rate": 0.00029646975759002115,
"loss": 0.5361,
"step": 50
},
{
"epoch": 0.014118477557503383,
"grad_norm": 2.1666100025177,
"learning_rate": 0.0002957637091080254,
"loss": 0.5137,
"step": 60
},
{
"epoch": 0.01647155715042061,
"grad_norm": 1.922058343887329,
"learning_rate": 0.0002950576606260296,
"loss": 0.446,
"step": 70
},
{
"epoch": 0.018824636743337845,
"grad_norm": 1.732611060142517,
"learning_rate": 0.00029435161214403386,
"loss": 0.4405,
"step": 80
},
{
"epoch": 0.021177716336255075,
"grad_norm": 2.2046239376068115,
"learning_rate": 0.0002936455636620381,
"loss": 0.3959,
"step": 90
},
{
"epoch": 0.023530795929172305,
"grad_norm": 2.083113670349121,
"learning_rate": 0.0002929395151800423,
"loss": 0.4114,
"step": 100
},
{
"epoch": 0.025883875522089535,
"grad_norm": 1.671247124671936,
"learning_rate": 0.00029223346669804656,
"loss": 0.3554,
"step": 110
},
{
"epoch": 0.028236955115006766,
"grad_norm": 2.001924514770508,
"learning_rate": 0.0002915274182160508,
"loss": 0.3577,
"step": 120
},
{
"epoch": 0.030590034707923996,
"grad_norm": 2.07259202003479,
"learning_rate": 0.00029082136973405503,
"loss": 0.3422,
"step": 130
},
{
"epoch": 0.03294311430084122,
"grad_norm": 1.7239247560501099,
"learning_rate": 0.00029011532125205927,
"loss": 0.3079,
"step": 140
},
{
"epoch": 0.03529619389375846,
"grad_norm": 1.7430157661437988,
"learning_rate": 0.0002894092727700635,
"loss": 0.3304,
"step": 150
},
{
"epoch": 0.03764927348667569,
"grad_norm": 1.1152617931365967,
"learning_rate": 0.00028870322428806774,
"loss": 0.3009,
"step": 160
},
{
"epoch": 0.04000235307959292,
"grad_norm": 1.5272759199142456,
"learning_rate": 0.00028799717580607197,
"loss": 0.3027,
"step": 170
},
{
"epoch": 0.04235543267251015,
"grad_norm": 1.3934285640716553,
"learning_rate": 0.0002872911273240762,
"loss": 0.2514,
"step": 180
},
{
"epoch": 0.04470851226542738,
"grad_norm": 1.7138372659683228,
"learning_rate": 0.00028658507884208044,
"loss": 0.2556,
"step": 190
},
{
"epoch": 0.04706159185834461,
"grad_norm": 1.7979109287261963,
"learning_rate": 0.00028587903036008473,
"loss": 0.2696,
"step": 200
},
{
"epoch": 0.04941467145126184,
"grad_norm": 1.342785358428955,
"learning_rate": 0.0002851729818780889,
"loss": 0.2496,
"step": 210
},
{
"epoch": 0.05176775104417907,
"grad_norm": 1.5516395568847656,
"learning_rate": 0.0002844669333960932,
"loss": 0.2727,
"step": 220
},
{
"epoch": 0.0541208306370963,
"grad_norm": 6.922358989715576,
"learning_rate": 0.0002837608849140974,
"loss": 0.2492,
"step": 230
},
{
"epoch": 0.05647391023001353,
"grad_norm": 1.5551228523254395,
"learning_rate": 0.00028305483643210167,
"loss": 0.2451,
"step": 240
},
{
"epoch": 0.05882698982293076,
"grad_norm": 1.300445318222046,
"learning_rate": 0.00028234878795010585,
"loss": 0.253,
"step": 250
},
{
"epoch": 0.06118006941584799,
"grad_norm": 1.4355467557907104,
"learning_rate": 0.00028164273946811014,
"loss": 0.2453,
"step": 260
},
{
"epoch": 0.06353314900876522,
"grad_norm": 15.9704008102417,
"learning_rate": 0.0002809366909861143,
"loss": 0.2072,
"step": 270
},
{
"epoch": 0.06588622860168244,
"grad_norm": 1.7124171257019043,
"learning_rate": 0.0002802306425041186,
"loss": 0.4367,
"step": 280
},
{
"epoch": 0.06823930819459968,
"grad_norm": 1.6787582635879517,
"learning_rate": 0.0002795245940221228,
"loss": 0.2716,
"step": 290
},
{
"epoch": 0.07059238778751692,
"grad_norm": 1.2618638277053833,
"learning_rate": 0.0002788185455401271,
"loss": 0.2821,
"step": 300
},
{
"epoch": 0.07294546738043414,
"grad_norm": 2.927347421646118,
"learning_rate": 0.00027811249705813126,
"loss": 0.2712,
"step": 310
},
{
"epoch": 0.07529854697335138,
"grad_norm": 1.9304898977279663,
"learning_rate": 0.00027740644857613555,
"loss": 0.2637,
"step": 320
},
{
"epoch": 0.0776516265662686,
"grad_norm": 1.2599807977676392,
"learning_rate": 0.0002767004000941398,
"loss": 0.2389,
"step": 330
},
{
"epoch": 0.08000470615918584,
"grad_norm": 1.4264953136444092,
"learning_rate": 0.000275994351612144,
"loss": 0.2143,
"step": 340
},
{
"epoch": 0.08235778575210306,
"grad_norm": 1.4229093790054321,
"learning_rate": 0.00027528830313014826,
"loss": 0.2571,
"step": 350
},
{
"epoch": 0.0847108653450203,
"grad_norm": 1.743034839630127,
"learning_rate": 0.0002745822546481525,
"loss": 0.2292,
"step": 360
},
{
"epoch": 0.08706394493793752,
"grad_norm": 1.3582898378372192,
"learning_rate": 0.00027387620616615673,
"loss": 0.2314,
"step": 370
},
{
"epoch": 0.08941702453085476,
"grad_norm": 1.2714539766311646,
"learning_rate": 0.00027317015768416096,
"loss": 0.2694,
"step": 380
},
{
"epoch": 0.09177010412377198,
"grad_norm": 1.0213568210601807,
"learning_rate": 0.0002724641092021652,
"loss": 0.2269,
"step": 390
},
{
"epoch": 0.09412318371668922,
"grad_norm": 0.8783596754074097,
"learning_rate": 0.00027175806072016943,
"loss": 0.2488,
"step": 400
},
{
"epoch": 0.09647626330960644,
"grad_norm": 1.842328667640686,
"learning_rate": 0.00027105201223817367,
"loss": 0.2175,
"step": 410
},
{
"epoch": 0.09882934290252368,
"grad_norm": 1.4185247421264648,
"learning_rate": 0.0002703459637561779,
"loss": 0.2049,
"step": 420
},
{
"epoch": 0.1011824224954409,
"grad_norm": 1.3057924509048462,
"learning_rate": 0.00026963991527418214,
"loss": 0.1819,
"step": 430
},
{
"epoch": 0.10353550208835814,
"grad_norm": 1.563916802406311,
"learning_rate": 0.0002689338667921864,
"loss": 0.2042,
"step": 440
},
{
"epoch": 0.10588858168127536,
"grad_norm": 0.9588648080825806,
"learning_rate": 0.0002682278183101906,
"loss": 0.1977,
"step": 450
},
{
"epoch": 0.1082416612741926,
"grad_norm": 1.3258203268051147,
"learning_rate": 0.00026752176982819485,
"loss": 0.1984,
"step": 460
},
{
"epoch": 0.11059474086710983,
"grad_norm": 1.6783477067947388,
"learning_rate": 0.0002668157213461991,
"loss": 0.2086,
"step": 470
},
{
"epoch": 0.11294782046002706,
"grad_norm": 1.820469617843628,
"learning_rate": 0.0002661096728642033,
"loss": 0.2128,
"step": 480
},
{
"epoch": 0.11530090005294429,
"grad_norm": 1.1493395566940308,
"learning_rate": 0.00026540362438220755,
"loss": 0.2022,
"step": 490
},
{
"epoch": 0.11765397964586152,
"grad_norm": 1.3134245872497559,
"learning_rate": 0.0002646975759002118,
"loss": 0.2199,
"step": 500
},
{
"epoch": 0.12000705923877875,
"grad_norm": 4.0345988273620605,
"learning_rate": 0.000263991527418216,
"loss": 0.2181,
"step": 510
},
{
"epoch": 0.12236013883169598,
"grad_norm": 1.7995352745056152,
"learning_rate": 0.00026328547893622026,
"loss": 0.2233,
"step": 520
},
{
"epoch": 0.1247132184246132,
"grad_norm": 1.6303889751434326,
"learning_rate": 0.0002625794304542245,
"loss": 0.2161,
"step": 530
},
{
"epoch": 0.12706629801753044,
"grad_norm": 1.1785417795181274,
"learning_rate": 0.0002618733819722287,
"loss": 0.1952,
"step": 540
},
{
"epoch": 0.12941937761044767,
"grad_norm": 2.5084645748138428,
"learning_rate": 0.00026116733349023296,
"loss": 0.204,
"step": 550
},
{
"epoch": 0.1317724572033649,
"grad_norm": 0.9784884452819824,
"learning_rate": 0.0002604612850082372,
"loss": 0.1598,
"step": 560
},
{
"epoch": 0.13412553679628214,
"grad_norm": 0.8020937442779541,
"learning_rate": 0.00025975523652624143,
"loss": 0.2045,
"step": 570
},
{
"epoch": 0.13647861638919936,
"grad_norm": 0.9151997566223145,
"learning_rate": 0.00025904918804424567,
"loss": 0.1744,
"step": 580
},
{
"epoch": 0.1388316959821166,
"grad_norm": 1.55955171585083,
"learning_rate": 0.0002583431395622499,
"loss": 0.1878,
"step": 590
},
{
"epoch": 0.14118477557503384,
"grad_norm": 23.52423858642578,
"learning_rate": 0.0002576370910802542,
"loss": 0.1779,
"step": 600
},
{
"epoch": 0.14353785516795106,
"grad_norm": 1.1516189575195312,
"learning_rate": 0.0002569310425982584,
"loss": 0.195,
"step": 610
},
{
"epoch": 0.14589093476086828,
"grad_norm": 1.0912541151046753,
"learning_rate": 0.00025622499411626266,
"loss": 0.205,
"step": 620
},
{
"epoch": 0.1482440143537855,
"grad_norm": 1.2680310010910034,
"learning_rate": 0.00025551894563426684,
"loss": 0.1654,
"step": 630
},
{
"epoch": 0.15059709394670276,
"grad_norm": 1.2099932432174683,
"learning_rate": 0.00025481289715227113,
"loss": 0.1698,
"step": 640
},
{
"epoch": 0.15295017353961998,
"grad_norm": 1.1155511140823364,
"learning_rate": 0.0002541068486702753,
"loss": 0.1883,
"step": 650
},
{
"epoch": 0.1553032531325372,
"grad_norm": 1.2237110137939453,
"learning_rate": 0.0002534008001882796,
"loss": 0.1739,
"step": 660
},
{
"epoch": 0.15765633272545443,
"grad_norm": 2.2334392070770264,
"learning_rate": 0.0002526947517062838,
"loss": 0.2149,
"step": 670
},
{
"epoch": 0.16000941231837168,
"grad_norm": 1.0051536560058594,
"learning_rate": 0.0002519887032242881,
"loss": 0.1755,
"step": 680
},
{
"epoch": 0.1623624919112889,
"grad_norm": 1.5381518602371216,
"learning_rate": 0.00025128265474229225,
"loss": 0.1814,
"step": 690
},
{
"epoch": 0.16471557150420613,
"grad_norm": 1.3390990495681763,
"learning_rate": 0.00025057660626029654,
"loss": 0.1866,
"step": 700
},
{
"epoch": 0.16706865109712335,
"grad_norm": 1.4517531394958496,
"learning_rate": 0.0002498705577783007,
"loss": 0.1928,
"step": 710
},
{
"epoch": 0.1694217306900406,
"grad_norm": 1.4081028699874878,
"learning_rate": 0.000249164509296305,
"loss": 0.1714,
"step": 720
},
{
"epoch": 0.17177481028295782,
"grad_norm": 1.357934832572937,
"learning_rate": 0.00024845846081430925,
"loss": 0.1856,
"step": 730
},
{
"epoch": 0.17412788987587505,
"grad_norm": 1.043090581893921,
"learning_rate": 0.0002477524123323135,
"loss": 0.1495,
"step": 740
},
{
"epoch": 0.17648096946879227,
"grad_norm": 1.2053163051605225,
"learning_rate": 0.0002470463638503177,
"loss": 0.1517,
"step": 750
},
{
"epoch": 0.17883404906170952,
"grad_norm": 2.3474409580230713,
"learning_rate": 0.00024634031536832195,
"loss": 0.1882,
"step": 760
},
{
"epoch": 0.18118712865462674,
"grad_norm": 0.8380926847457886,
"learning_rate": 0.0002456342668863262,
"loss": 0.1796,
"step": 770
},
{
"epoch": 0.18354020824754397,
"grad_norm": 1.3997254371643066,
"learning_rate": 0.0002449282184043304,
"loss": 0.1843,
"step": 780
},
{
"epoch": 0.1858932878404612,
"grad_norm": 1.3143609762191772,
"learning_rate": 0.00024422216992233466,
"loss": 0.1629,
"step": 790
},
{
"epoch": 0.18824636743337844,
"grad_norm": 0.9414114952087402,
"learning_rate": 0.0002435161214403389,
"loss": 0.1583,
"step": 800
},
{
"epoch": 0.19059944702629567,
"grad_norm": 1.0523838996887207,
"learning_rate": 0.0002428100729583431,
"loss": 0.194,
"step": 810
},
{
"epoch": 0.1929525266192129,
"grad_norm": 1.0871750116348267,
"learning_rate": 0.00024210402447634737,
"loss": 0.15,
"step": 820
},
{
"epoch": 0.1953056062121301,
"grad_norm": 1.515932321548462,
"learning_rate": 0.0002413979759943516,
"loss": 0.1895,
"step": 830
},
{
"epoch": 0.19765868580504736,
"grad_norm": 0.7211456298828125,
"learning_rate": 0.00024069192751235584,
"loss": 0.1685,
"step": 840
},
{
"epoch": 0.20001176539796459,
"grad_norm": 0.7664592862129211,
"learning_rate": 0.00023998587903036007,
"loss": 0.1658,
"step": 850
},
{
"epoch": 0.2023648449908818,
"grad_norm": 0.8728657960891724,
"learning_rate": 0.0002392798305483643,
"loss": 0.1468,
"step": 860
},
{
"epoch": 0.20471792458379906,
"grad_norm": 1.3027325868606567,
"learning_rate": 0.00023857378206636854,
"loss": 0.1633,
"step": 870
},
{
"epoch": 0.20707100417671628,
"grad_norm": 1.1061084270477295,
"learning_rate": 0.00023786773358437278,
"loss": 0.1861,
"step": 880
},
{
"epoch": 0.2094240837696335,
"grad_norm": 1.176365613937378,
"learning_rate": 0.000237161685102377,
"loss": 0.1589,
"step": 890
},
{
"epoch": 0.21177716336255073,
"grad_norm": 0.8307468295097351,
"learning_rate": 0.00023645563662038127,
"loss": 0.172,
"step": 900
},
{
"epoch": 0.21413024295546798,
"grad_norm": 1.2759816646575928,
"learning_rate": 0.00023574958813838548,
"loss": 0.1475,
"step": 910
},
{
"epoch": 0.2164833225483852,
"grad_norm": 1.661071538925171,
"learning_rate": 0.00023504353965638974,
"loss": 0.2048,
"step": 920
},
{
"epoch": 0.21883640214130243,
"grad_norm": 1.3144210577011108,
"learning_rate": 0.00023433749117439395,
"loss": 0.1582,
"step": 930
},
{
"epoch": 0.22118948173421965,
"grad_norm": 1.1830146312713623,
"learning_rate": 0.00023363144269239821,
"loss": 0.1567,
"step": 940
},
{
"epoch": 0.2235425613271369,
"grad_norm": 0.7755473256111145,
"learning_rate": 0.00023292539421040242,
"loss": 0.1369,
"step": 950
},
{
"epoch": 0.22589564092005412,
"grad_norm": 0.708152711391449,
"learning_rate": 0.00023221934572840666,
"loss": 0.1477,
"step": 960
},
{
"epoch": 0.22824872051297135,
"grad_norm": 0.9567592144012451,
"learning_rate": 0.0002315132972464109,
"loss": 0.1685,
"step": 970
},
{
"epoch": 0.23060180010588857,
"grad_norm": 1.019717812538147,
"learning_rate": 0.00023080724876441513,
"loss": 0.1485,
"step": 980
},
{
"epoch": 0.23295487969880582,
"grad_norm": 3.704050302505493,
"learning_rate": 0.00023010120028241936,
"loss": 0.182,
"step": 990
},
{
"epoch": 0.23530795929172305,
"grad_norm": 1.3113001585006714,
"learning_rate": 0.0002293951518004236,
"loss": 0.2033,
"step": 1000
},
{
"epoch": 0.23766103888464027,
"grad_norm": 1.586300253868103,
"learning_rate": 0.00022868910331842783,
"loss": 0.1658,
"step": 1010
},
{
"epoch": 0.2400141184775575,
"grad_norm": 0.7029755711555481,
"learning_rate": 0.00022798305483643207,
"loss": 0.1675,
"step": 1020
},
{
"epoch": 0.24236719807047474,
"grad_norm": 0.9558175802230835,
"learning_rate": 0.00022727700635443633,
"loss": 0.1292,
"step": 1030
},
{
"epoch": 0.24472027766339197,
"grad_norm": 0.552598774433136,
"learning_rate": 0.00022657095787244054,
"loss": 0.1271,
"step": 1040
},
{
"epoch": 0.2470733572563092,
"grad_norm": 1.160657525062561,
"learning_rate": 0.0002258649093904448,
"loss": 0.1406,
"step": 1050
},
{
"epoch": 0.2494264368492264,
"grad_norm": 0.9359754323959351,
"learning_rate": 0.000225158860908449,
"loss": 0.1456,
"step": 1060
},
{
"epoch": 0.25177951644214364,
"grad_norm": 0.6799198985099792,
"learning_rate": 0.00022445281242645327,
"loss": 0.1235,
"step": 1070
},
{
"epoch": 0.2541325960350609,
"grad_norm": 0.97700434923172,
"learning_rate": 0.00022374676394445748,
"loss": 0.1721,
"step": 1080
},
{
"epoch": 0.25648567562797814,
"grad_norm": 0.6762118935585022,
"learning_rate": 0.00022304071546246174,
"loss": 0.2006,
"step": 1090
},
{
"epoch": 0.25883875522089533,
"grad_norm": 0.6071228384971619,
"learning_rate": 0.00022233466698046595,
"loss": 0.1647,
"step": 1100
},
{
"epoch": 0.2611918348138126,
"grad_norm": 0.7097590565681458,
"learning_rate": 0.0002216286184984702,
"loss": 0.1662,
"step": 1110
},
{
"epoch": 0.2635449144067298,
"grad_norm": 0.48786184191703796,
"learning_rate": 0.00022092257001647442,
"loss": 0.1378,
"step": 1120
},
{
"epoch": 0.26589799399964703,
"grad_norm": 0.7238913178443909,
"learning_rate": 0.00022021652153447868,
"loss": 0.163,
"step": 1130
},
{
"epoch": 0.2682510735925643,
"grad_norm": 1.3571726083755493,
"learning_rate": 0.00021951047305248292,
"loss": 0.1545,
"step": 1140
},
{
"epoch": 0.2706041531854815,
"grad_norm": 0.6683372259140015,
"learning_rate": 0.00021880442457048715,
"loss": 0.1375,
"step": 1150
},
{
"epoch": 0.27295723277839873,
"grad_norm": 1.9159690141677856,
"learning_rate": 0.0002180983760884914,
"loss": 0.1604,
"step": 1160
},
{
"epoch": 0.275310312371316,
"grad_norm": 1.6136759519577026,
"learning_rate": 0.00021739232760649562,
"loss": 0.1827,
"step": 1170
},
{
"epoch": 0.2776633919642332,
"grad_norm": 1.2445416450500488,
"learning_rate": 0.00021668627912449986,
"loss": 0.1283,
"step": 1180
},
{
"epoch": 0.2800164715571504,
"grad_norm": 1.143410563468933,
"learning_rate": 0.0002159802306425041,
"loss": 0.1571,
"step": 1190
},
{
"epoch": 0.2823695511500677,
"grad_norm": 0.641952633857727,
"learning_rate": 0.00021527418216050833,
"loss": 0.1511,
"step": 1200
},
{
"epoch": 0.2847226307429849,
"grad_norm": 0.9618122577667236,
"learning_rate": 0.00021456813367851256,
"loss": 0.1251,
"step": 1210
},
{
"epoch": 0.2870757103359021,
"grad_norm": 1.040390133857727,
"learning_rate": 0.0002138620851965168,
"loss": 0.1481,
"step": 1220
},
{
"epoch": 0.2894287899288193,
"grad_norm": 2.470360279083252,
"learning_rate": 0.00021315603671452106,
"loss": 0.1523,
"step": 1230
},
{
"epoch": 0.29178186952173657,
"grad_norm": 1.15378737449646,
"learning_rate": 0.00021244998823252527,
"loss": 0.1526,
"step": 1240
},
{
"epoch": 0.2941349491146538,
"grad_norm": 1.2236779928207397,
"learning_rate": 0.00021174393975052953,
"loss": 0.15,
"step": 1250
},
{
"epoch": 0.296488028707571,
"grad_norm": 0.6974225640296936,
"learning_rate": 0.00021103789126853374,
"loss": 0.1529,
"step": 1260
},
{
"epoch": 0.29884110830048827,
"grad_norm": 1.2019627094268799,
"learning_rate": 0.000210331842786538,
"loss": 0.1534,
"step": 1270
},
{
"epoch": 0.3011941878934055,
"grad_norm": 1.5245829820632935,
"learning_rate": 0.0002096257943045422,
"loss": 0.1452,
"step": 1280
},
{
"epoch": 0.3035472674863227,
"grad_norm": 1.5062931776046753,
"learning_rate": 0.00020891974582254647,
"loss": 0.1617,
"step": 1290
},
{
"epoch": 0.30590034707923996,
"grad_norm": 0.5989176034927368,
"learning_rate": 0.00020821369734055068,
"loss": 0.1567,
"step": 1300
},
{
"epoch": 0.30825342667215716,
"grad_norm": 1.1063286066055298,
"learning_rate": 0.00020750764885855494,
"loss": 0.1651,
"step": 1310
},
{
"epoch": 0.3106065062650744,
"grad_norm": 0.9815717935562134,
"learning_rate": 0.00020680160037655915,
"loss": 0.1485,
"step": 1320
},
{
"epoch": 0.31295958585799166,
"grad_norm": 1.218807578086853,
"learning_rate": 0.0002060955518945634,
"loss": 0.1151,
"step": 1330
},
{
"epoch": 0.31531266545090886,
"grad_norm": 1.1629014015197754,
"learning_rate": 0.00020538950341256765,
"loss": 0.1406,
"step": 1340
},
{
"epoch": 0.3176657450438261,
"grad_norm": 0.6818956732749939,
"learning_rate": 0.00020468345493057188,
"loss": 0.1465,
"step": 1350
},
{
"epoch": 0.32001882463674336,
"grad_norm": 0.7869308590888977,
"learning_rate": 0.00020397740644857612,
"loss": 0.1515,
"step": 1360
},
{
"epoch": 0.32237190422966056,
"grad_norm": 1.023478627204895,
"learning_rate": 0.00020327135796658035,
"loss": 0.1781,
"step": 1370
},
{
"epoch": 0.3247249838225778,
"grad_norm": 1.0383384227752686,
"learning_rate": 0.0002025653094845846,
"loss": 0.1195,
"step": 1380
},
{
"epoch": 0.327078063415495,
"grad_norm": 1.5291595458984375,
"learning_rate": 0.00020185926100258882,
"loss": 0.1334,
"step": 1390
},
{
"epoch": 0.32943114300841225,
"grad_norm": 0.9488996267318726,
"learning_rate": 0.00020115321252059306,
"loss": 0.1368,
"step": 1400
},
{
"epoch": 0.3317842226013295,
"grad_norm": 1.1703331470489502,
"learning_rate": 0.0002004471640385973,
"loss": 0.131,
"step": 1410
},
{
"epoch": 0.3341373021942467,
"grad_norm": 0.6122885346412659,
"learning_rate": 0.00019974111555660153,
"loss": 0.1356,
"step": 1420
},
{
"epoch": 0.33649038178716395,
"grad_norm": 0.7869921326637268,
"learning_rate": 0.0001990350670746058,
"loss": 0.1817,
"step": 1430
},
{
"epoch": 0.3388434613800812,
"grad_norm": 0.691066324710846,
"learning_rate": 0.00019832901859261,
"loss": 0.131,
"step": 1440
},
{
"epoch": 0.3411965409729984,
"grad_norm": 1.4205127954483032,
"learning_rate": 0.00019762297011061426,
"loss": 0.1366,
"step": 1450
},
{
"epoch": 0.34354962056591565,
"grad_norm": 0.47127053141593933,
"learning_rate": 0.00019691692162861847,
"loss": 0.1498,
"step": 1460
},
{
"epoch": 0.3459027001588329,
"grad_norm": 0.9336820840835571,
"learning_rate": 0.00019621087314662273,
"loss": 0.1512,
"step": 1470
},
{
"epoch": 0.3482557797517501,
"grad_norm": 0.8124200105667114,
"learning_rate": 0.00019550482466462694,
"loss": 0.1319,
"step": 1480
},
{
"epoch": 0.35060885934466735,
"grad_norm": 0.6921178698539734,
"learning_rate": 0.0001947987761826312,
"loss": 0.1279,
"step": 1490
},
{
"epoch": 0.35296193893758454,
"grad_norm": 1.336229681968689,
"learning_rate": 0.0001940927277006354,
"loss": 0.1251,
"step": 1500
},
{
"epoch": 0.3553150185305018,
"grad_norm": 0.9984803795814514,
"learning_rate": 0.00019338667921863967,
"loss": 0.1299,
"step": 1510
},
{
"epoch": 0.35766809812341904,
"grad_norm": 1.0903042554855347,
"learning_rate": 0.00019268063073664388,
"loss": 0.1528,
"step": 1520
},
{
"epoch": 0.36002117771633624,
"grad_norm": 0.666950523853302,
"learning_rate": 0.00019197458225464814,
"loss": 0.1446,
"step": 1530
},
{
"epoch": 0.3623742573092535,
"grad_norm": 0.8104845285415649,
"learning_rate": 0.00019126853377265238,
"loss": 0.1221,
"step": 1540
},
{
"epoch": 0.36472733690217074,
"grad_norm": 0.5904582738876343,
"learning_rate": 0.00019056248529065661,
"loss": 0.1164,
"step": 1550
},
{
"epoch": 0.36708041649508794,
"grad_norm": 0.7703972458839417,
"learning_rate": 0.00018985643680866085,
"loss": 0.0978,
"step": 1560
},
{
"epoch": 0.3694334960880052,
"grad_norm": 1.9245415925979614,
"learning_rate": 0.00018915038832666508,
"loss": 0.1624,
"step": 1570
},
{
"epoch": 0.3717865756809224,
"grad_norm": 1.6459194421768188,
"learning_rate": 0.00018844433984466932,
"loss": 0.1289,
"step": 1580
},
{
"epoch": 0.37413965527383963,
"grad_norm": 1.6774044036865234,
"learning_rate": 0.00018773829136267355,
"loss": 0.1468,
"step": 1590
},
{
"epoch": 0.3764927348667569,
"grad_norm": 1.5878580808639526,
"learning_rate": 0.0001870322428806778,
"loss": 0.1318,
"step": 1600
},
{
"epoch": 0.3788458144596741,
"grad_norm": 0.7039738297462463,
"learning_rate": 0.00018632619439868203,
"loss": 0.1242,
"step": 1610
},
{
"epoch": 0.38119889405259133,
"grad_norm": 1.1770200729370117,
"learning_rate": 0.00018562014591668626,
"loss": 0.1321,
"step": 1620
},
{
"epoch": 0.3835519736455086,
"grad_norm": 2.2201638221740723,
"learning_rate": 0.00018491409743469052,
"loss": 0.1214,
"step": 1630
},
{
"epoch": 0.3859050532384258,
"grad_norm": 0.756149411201477,
"learning_rate": 0.00018420804895269473,
"loss": 0.1219,
"step": 1640
},
{
"epoch": 0.38825813283134303,
"grad_norm": 0.5444088578224182,
"learning_rate": 0.000183502000470699,
"loss": 0.1346,
"step": 1650
},
{
"epoch": 0.3906112124242602,
"grad_norm": 0.7643070816993713,
"learning_rate": 0.0001827959519887032,
"loss": 0.1324,
"step": 1660
},
{
"epoch": 0.3929642920171775,
"grad_norm": 0.885362446308136,
"learning_rate": 0.00018208990350670746,
"loss": 0.1166,
"step": 1670
},
{
"epoch": 0.3953173716100947,
"grad_norm": 0.7135679721832275,
"learning_rate": 0.00018138385502471167,
"loss": 0.1364,
"step": 1680
},
{
"epoch": 0.3976704512030119,
"grad_norm": 0.5533025860786438,
"learning_rate": 0.00018067780654271593,
"loss": 0.1137,
"step": 1690
},
{
"epoch": 0.40002353079592917,
"grad_norm": 0.5916281342506409,
"learning_rate": 0.00017997175806072014,
"loss": 0.1131,
"step": 1700
},
{
"epoch": 0.4023766103888464,
"grad_norm": 0.8299354314804077,
"learning_rate": 0.0001792657095787244,
"loss": 0.1331,
"step": 1710
},
{
"epoch": 0.4047296899817636,
"grad_norm": 0.7944399118423462,
"learning_rate": 0.0001785596610967286,
"loss": 0.1049,
"step": 1720
},
{
"epoch": 0.40708276957468087,
"grad_norm": 0.6967952251434326,
"learning_rate": 0.00017785361261473287,
"loss": 0.0997,
"step": 1730
},
{
"epoch": 0.4094358491675981,
"grad_norm": 0.42431318759918213,
"learning_rate": 0.0001771475641327371,
"loss": 0.0964,
"step": 1740
},
{
"epoch": 0.4117889287605153,
"grad_norm": 0.6767364740371704,
"learning_rate": 0.00017644151565074134,
"loss": 0.1627,
"step": 1750
},
{
"epoch": 0.41414200835343257,
"grad_norm": 1.0430301427841187,
"learning_rate": 0.00017573546716874558,
"loss": 0.1173,
"step": 1760
},
{
"epoch": 0.41649508794634976,
"grad_norm": 0.6168161034584045,
"learning_rate": 0.00017502941868674981,
"loss": 0.1229,
"step": 1770
},
{
"epoch": 0.418848167539267,
"grad_norm": 1.9067519903182983,
"learning_rate": 0.00017432337020475405,
"loss": 0.1369,
"step": 1780
},
{
"epoch": 0.42120124713218426,
"grad_norm": 1.5157831907272339,
"learning_rate": 0.00017361732172275829,
"loss": 0.1243,
"step": 1790
},
{
"epoch": 0.42355432672510146,
"grad_norm": 1.5152102708816528,
"learning_rate": 0.00017291127324076252,
"loss": 0.1395,
"step": 1800
},
{
"epoch": 0.4259074063180187,
"grad_norm": 0.8262742161750793,
"learning_rate": 0.00017220522475876676,
"loss": 0.1467,
"step": 1810
},
{
"epoch": 0.42826048591093596,
"grad_norm": 0.5484256744384766,
"learning_rate": 0.000171499176276771,
"loss": 0.1405,
"step": 1820
},
{
"epoch": 0.43061356550385316,
"grad_norm": 0.7796267867088318,
"learning_rate": 0.00017079312779477525,
"loss": 0.1508,
"step": 1830
},
{
"epoch": 0.4329666450967704,
"grad_norm": 0.7360082268714905,
"learning_rate": 0.00017008707931277946,
"loss": 0.1332,
"step": 1840
},
{
"epoch": 0.4353197246896876,
"grad_norm": 0.8352281451225281,
"learning_rate": 0.00016938103083078372,
"loss": 0.1343,
"step": 1850
},
{
"epoch": 0.43767280428260485,
"grad_norm": 0.6898388862609863,
"learning_rate": 0.00016867498234878793,
"loss": 0.0983,
"step": 1860
},
{
"epoch": 0.4400258838755221,
"grad_norm": 0.3843238651752472,
"learning_rate": 0.0001679689338667922,
"loss": 0.1091,
"step": 1870
},
{
"epoch": 0.4423789634684393,
"grad_norm": 0.7791532278060913,
"learning_rate": 0.0001672628853847964,
"loss": 0.1321,
"step": 1880
},
{
"epoch": 0.44473204306135655,
"grad_norm": 0.9906323552131653,
"learning_rate": 0.00016655683690280064,
"loss": 0.1125,
"step": 1890
},
{
"epoch": 0.4470851226542738,
"grad_norm": 0.631594181060791,
"learning_rate": 0.00016585078842080487,
"loss": 0.1328,
"step": 1900
},
{
"epoch": 0.449438202247191,
"grad_norm": 1.4922380447387695,
"learning_rate": 0.0001651447399388091,
"loss": 0.1441,
"step": 1910
},
{
"epoch": 0.45179128184010825,
"grad_norm": 0.6896445751190186,
"learning_rate": 0.00016443869145681334,
"loss": 0.146,
"step": 1920
},
{
"epoch": 0.45414436143302545,
"grad_norm": 0.6470409035682678,
"learning_rate": 0.00016373264297481758,
"loss": 0.1123,
"step": 1930
},
{
"epoch": 0.4564974410259427,
"grad_norm": 1.4532804489135742,
"learning_rate": 0.00016302659449282184,
"loss": 0.1204,
"step": 1940
},
{
"epoch": 0.45885052061885995,
"grad_norm": 1.5582534074783325,
"learning_rate": 0.00016232054601082605,
"loss": 0.1275,
"step": 1950
},
{
"epoch": 0.46120360021177714,
"grad_norm": 0.7568921446800232,
"learning_rate": 0.0001616144975288303,
"loss": 0.1373,
"step": 1960
},
{
"epoch": 0.4635566798046944,
"grad_norm": 0.7904714941978455,
"learning_rate": 0.00016090844904683452,
"loss": 0.1281,
"step": 1970
},
{
"epoch": 0.46590975939761164,
"grad_norm": 0.48104897141456604,
"learning_rate": 0.00016020240056483878,
"loss": 0.1173,
"step": 1980
},
{
"epoch": 0.46826283899052884,
"grad_norm": 0.6676899194717407,
"learning_rate": 0.000159496352082843,
"loss": 0.1297,
"step": 1990
},
{
"epoch": 0.4706159185834461,
"grad_norm": 0.7035501599311829,
"learning_rate": 0.00015879030360084725,
"loss": 0.1246,
"step": 2000
},
{
"epoch": 0.47296899817636334,
"grad_norm": 1.289421796798706,
"learning_rate": 0.00015808425511885146,
"loss": 0.1501,
"step": 2010
},
{
"epoch": 0.47532207776928054,
"grad_norm": 0.6186831593513489,
"learning_rate": 0.00015737820663685572,
"loss": 0.1156,
"step": 2020
},
{
"epoch": 0.4776751573621978,
"grad_norm": 0.7897233963012695,
"learning_rate": 0.00015667215815485993,
"loss": 0.1317,
"step": 2030
},
{
"epoch": 0.480028236955115,
"grad_norm": 1.1652599573135376,
"learning_rate": 0.0001559661096728642,
"loss": 0.1325,
"step": 2040
},
{
"epoch": 0.48238131654803224,
"grad_norm": 0.6400769948959351,
"learning_rate": 0.0001552600611908684,
"loss": 0.1002,
"step": 2050
},
{
"epoch": 0.4847343961409495,
"grad_norm": 0.5541133880615234,
"learning_rate": 0.00015455401270887266,
"loss": 0.1232,
"step": 2060
},
{
"epoch": 0.4870874757338667,
"grad_norm": 0.605411946773529,
"learning_rate": 0.0001538479642268769,
"loss": 0.1102,
"step": 2070
},
{
"epoch": 0.48944055532678393,
"grad_norm": 0.49058374762535095,
"learning_rate": 0.00015314191574488113,
"loss": 0.1228,
"step": 2080
},
{
"epoch": 0.4917936349197012,
"grad_norm": 0.7565241456031799,
"learning_rate": 0.00015243586726288537,
"loss": 0.1009,
"step": 2090
},
{
"epoch": 0.4941467145126184,
"grad_norm": 0.4517477750778198,
"learning_rate": 0.0001517298187808896,
"loss": 0.129,
"step": 2100
},
{
"epoch": 0.49649979410553563,
"grad_norm": 0.7871853709220886,
"learning_rate": 0.00015102377029889384,
"loss": 0.1049,
"step": 2110
},
{
"epoch": 0.4988528736984528,
"grad_norm": 0.4314168691635132,
"learning_rate": 0.00015031772181689807,
"loss": 0.117,
"step": 2120
},
{
"epoch": 0.5012059532913701,
"grad_norm": 0.8347052335739136,
"learning_rate": 0.0001496116733349023,
"loss": 0.1336,
"step": 2130
},
{
"epoch": 0.5035590328842873,
"grad_norm": 0.42039480805397034,
"learning_rate": 0.00014890562485290657,
"loss": 0.1176,
"step": 2140
},
{
"epoch": 0.5059121124772046,
"grad_norm": 1.1371684074401855,
"learning_rate": 0.0001481995763709108,
"loss": 0.1362,
"step": 2150
},
{
"epoch": 0.5082651920701218,
"grad_norm": 0.8690921664237976,
"learning_rate": 0.00014749352788891504,
"loss": 0.1444,
"step": 2160
},
{
"epoch": 0.510618271663039,
"grad_norm": 0.3952578604221344,
"learning_rate": 0.00014678747940691928,
"loss": 0.1411,
"step": 2170
},
{
"epoch": 0.5129713512559563,
"grad_norm": 1.0104624032974243,
"learning_rate": 0.0001460814309249235,
"loss": 0.1127,
"step": 2180
},
{
"epoch": 0.5153244308488735,
"grad_norm": 0.7708560824394226,
"learning_rate": 0.00014537538244292775,
"loss": 0.1364,
"step": 2190
},
{
"epoch": 0.5176775104417907,
"grad_norm": 3.323113203048706,
"learning_rate": 0.00014466933396093198,
"loss": 0.1326,
"step": 2200
},
{
"epoch": 0.520030590034708,
"grad_norm": 0.5021951198577881,
"learning_rate": 0.00014396328547893622,
"loss": 0.1075,
"step": 2210
},
{
"epoch": 0.5223836696276252,
"grad_norm": 0.5558544397354126,
"learning_rate": 0.00014325723699694045,
"loss": 0.1019,
"step": 2220
},
{
"epoch": 0.5247367492205424,
"grad_norm": 0.7476164102554321,
"learning_rate": 0.0001425511885149447,
"loss": 0.108,
"step": 2230
},
{
"epoch": 0.5270898288134596,
"grad_norm": 0.8783542513847351,
"learning_rate": 0.00014184514003294892,
"loss": 0.1182,
"step": 2240
},
{
"epoch": 0.5294429084063769,
"grad_norm": 0.5716719627380371,
"learning_rate": 0.00014113909155095316,
"loss": 0.1048,
"step": 2250
},
{
"epoch": 0.5317959879992941,
"grad_norm": 0.41919055581092834,
"learning_rate": 0.0001404330430689574,
"loss": 0.1071,
"step": 2260
},
{
"epoch": 0.5341490675922113,
"grad_norm": 0.672885537147522,
"learning_rate": 0.00013972699458696163,
"loss": 0.1333,
"step": 2270
},
{
"epoch": 0.5365021471851286,
"grad_norm": 0.7414030432701111,
"learning_rate": 0.00013902094610496586,
"loss": 0.1288,
"step": 2280
},
{
"epoch": 0.5388552267780458,
"grad_norm": 1.1601518392562866,
"learning_rate": 0.0001383148976229701,
"loss": 0.1099,
"step": 2290
},
{
"epoch": 0.541208306370963,
"grad_norm": 0.4423375129699707,
"learning_rate": 0.00013760884914097433,
"loss": 0.1049,
"step": 2300
},
{
"epoch": 0.5435613859638803,
"grad_norm": 0.9248809218406677,
"learning_rate": 0.00013690280065897857,
"loss": 0.1172,
"step": 2310
},
{
"epoch": 0.5459144655567975,
"grad_norm": 1.3502943515777588,
"learning_rate": 0.0001361967521769828,
"loss": 0.1303,
"step": 2320
},
{
"epoch": 0.5482675451497147,
"grad_norm": 1.488297939300537,
"learning_rate": 0.00013549070369498704,
"loss": 0.1182,
"step": 2330
},
{
"epoch": 0.550620624742632,
"grad_norm": 0.6636572480201721,
"learning_rate": 0.0001347846552129913,
"loss": 0.1233,
"step": 2340
},
{
"epoch": 0.5529737043355492,
"grad_norm": 0.5864549279212952,
"learning_rate": 0.00013407860673099554,
"loss": 0.1102,
"step": 2350
},
{
"epoch": 0.5553267839284663,
"grad_norm": 1.9224406480789185,
"learning_rate": 0.00013337255824899977,
"loss": 0.1449,
"step": 2360
},
{
"epoch": 0.5576798635213837,
"grad_norm": 1.1239560842514038,
"learning_rate": 0.00013266650976700398,
"loss": 0.1155,
"step": 2370
},
{
"epoch": 0.5600329431143009,
"grad_norm": 0.6336050629615784,
"learning_rate": 0.00013196046128500821,
"loss": 0.1193,
"step": 2380
},
{
"epoch": 0.562386022707218,
"grad_norm": 0.9129360914230347,
"learning_rate": 0.00013125441280301245,
"loss": 0.1121,
"step": 2390
},
{
"epoch": 0.5647391023001354,
"grad_norm": 0.6220555305480957,
"learning_rate": 0.00013054836432101668,
"loss": 0.1172,
"step": 2400
},
{
"epoch": 0.5670921818930525,
"grad_norm": 0.8981531262397766,
"learning_rate": 0.00012984231583902092,
"loss": 0.1184,
"step": 2410
},
{
"epoch": 0.5694452614859697,
"grad_norm": 0.7610392570495605,
"learning_rate": 0.00012913626735702515,
"loss": 0.1204,
"step": 2420
},
{
"epoch": 0.5717983410788869,
"grad_norm": 0.5133729577064514,
"learning_rate": 0.0001284302188750294,
"loss": 0.1081,
"step": 2430
},
{
"epoch": 0.5741514206718042,
"grad_norm": 0.8097817897796631,
"learning_rate": 0.00012772417039303363,
"loss": 0.1142,
"step": 2440
},
{
"epoch": 0.5765045002647214,
"grad_norm": 1.8712083101272583,
"learning_rate": 0.00012701812191103786,
"loss": 0.1234,
"step": 2450
},
{
"epoch": 0.5788575798576386,
"grad_norm": 0.8425026535987854,
"learning_rate": 0.00012631207342904212,
"loss": 0.1027,
"step": 2460
},
{
"epoch": 0.5812106594505559,
"grad_norm": 0.5562009811401367,
"learning_rate": 0.00012560602494704636,
"loss": 0.0916,
"step": 2470
},
{
"epoch": 0.5835637390434731,
"grad_norm": 0.45057183504104614,
"learning_rate": 0.0001248999764650506,
"loss": 0.1166,
"step": 2480
},
{
"epoch": 0.5859168186363903,
"grad_norm": 0.5411068797111511,
"learning_rate": 0.00012419392798305483,
"loss": 0.1254,
"step": 2490
},
{
"epoch": 0.5882698982293076,
"grad_norm": 0.9400952458381653,
"learning_rate": 0.00012348787950105906,
"loss": 0.1411,
"step": 2500
},
{
"epoch": 0.5906229778222248,
"grad_norm": 0.4275170564651489,
"learning_rate": 0.0001227818310190633,
"loss": 0.1089,
"step": 2510
},
{
"epoch": 0.592976057415142,
"grad_norm": 1.2033214569091797,
"learning_rate": 0.00012207578253706753,
"loss": 0.108,
"step": 2520
},
{
"epoch": 0.5953291370080593,
"grad_norm": 1.257379412651062,
"learning_rate": 0.00012136973405507177,
"loss": 0.1207,
"step": 2530
},
{
"epoch": 0.5976822166009765,
"grad_norm": 0.7070032954216003,
"learning_rate": 0.000120663685573076,
"loss": 0.0879,
"step": 2540
},
{
"epoch": 0.6000352961938937,
"grad_norm": 0.8550868034362793,
"learning_rate": 0.00011995763709108024,
"loss": 0.1087,
"step": 2550
},
{
"epoch": 0.602388375786811,
"grad_norm": 0.8301357626914978,
"learning_rate": 0.00011925158860908447,
"loss": 0.1266,
"step": 2560
},
{
"epoch": 0.6047414553797282,
"grad_norm": 0.4070800542831421,
"learning_rate": 0.00011854554012708871,
"loss": 0.1062,
"step": 2570
},
{
"epoch": 0.6070945349726454,
"grad_norm": 1.1967391967773438,
"learning_rate": 0.00011783949164509294,
"loss": 0.1147,
"step": 2580
},
{
"epoch": 0.6094476145655627,
"grad_norm": 0.5281302332878113,
"learning_rate": 0.00011713344316309718,
"loss": 0.0912,
"step": 2590
},
{
"epoch": 0.6118006941584799,
"grad_norm": 0.5271784067153931,
"learning_rate": 0.00011642739468110142,
"loss": 0.1084,
"step": 2600
},
{
"epoch": 0.6141537737513971,
"grad_norm": 0.4973151683807373,
"learning_rate": 0.00011572134619910566,
"loss": 0.1242,
"step": 2610
},
{
"epoch": 0.6165068533443143,
"grad_norm": 0.4281303882598877,
"learning_rate": 0.0001150152977171099,
"loss": 0.1101,
"step": 2620
},
{
"epoch": 0.6188599329372316,
"grad_norm": 0.5142689347267151,
"learning_rate": 0.00011430924923511413,
"loss": 0.1182,
"step": 2630
},
{
"epoch": 0.6212130125301488,
"grad_norm": 0.5125661492347717,
"learning_rate": 0.00011360320075311837,
"loss": 0.0943,
"step": 2640
},
{
"epoch": 0.623566092123066,
"grad_norm": 0.43077680468559265,
"learning_rate": 0.0001128971522711226,
"loss": 0.0948,
"step": 2650
},
{
"epoch": 0.6259191717159833,
"grad_norm": 0.5074141621589661,
"learning_rate": 0.00011219110378912684,
"loss": 0.0853,
"step": 2660
},
{
"epoch": 0.6282722513089005,
"grad_norm": 0.8260855674743652,
"learning_rate": 0.00011148505530713107,
"loss": 0.1104,
"step": 2670
},
{
"epoch": 0.6306253309018177,
"grad_norm": 0.7819215059280396,
"learning_rate": 0.00011077900682513531,
"loss": 0.1256,
"step": 2680
},
{
"epoch": 0.632978410494735,
"grad_norm": 0.46884438395500183,
"learning_rate": 0.00011007295834313955,
"loss": 0.1027,
"step": 2690
},
{
"epoch": 0.6353314900876522,
"grad_norm": 0.9515593647956848,
"learning_rate": 0.00010936690986114378,
"loss": 0.112,
"step": 2700
},
{
"epoch": 0.6376845696805694,
"grad_norm": 0.3602767586708069,
"learning_rate": 0.00010866086137914803,
"loss": 0.1053,
"step": 2710
},
{
"epoch": 0.6400376492734867,
"grad_norm": 0.7740781903266907,
"learning_rate": 0.00010795481289715226,
"loss": 0.112,
"step": 2720
},
{
"epoch": 0.6423907288664039,
"grad_norm": 0.5003033876419067,
"learning_rate": 0.0001072487644151565,
"loss": 0.0985,
"step": 2730
},
{
"epoch": 0.6447438084593211,
"grad_norm": 0.4092664122581482,
"learning_rate": 0.00010654271593316073,
"loss": 0.1113,
"step": 2740
},
{
"epoch": 0.6470968880522384,
"grad_norm": 0.446584552526474,
"learning_rate": 0.00010583666745116497,
"loss": 0.0909,
"step": 2750
},
{
"epoch": 0.6494499676451556,
"grad_norm": 0.3130131661891937,
"learning_rate": 0.0001051306189691692,
"loss": 0.0954,
"step": 2760
},
{
"epoch": 0.6518030472380728,
"grad_norm": 0.7232083082199097,
"learning_rate": 0.00010442457048717344,
"loss": 0.1132,
"step": 2770
},
{
"epoch": 0.65415612683099,
"grad_norm": 0.5579691529273987,
"learning_rate": 0.00010371852200517768,
"loss": 0.1045,
"step": 2780
},
{
"epoch": 0.6565092064239073,
"grad_norm": 0.5319089889526367,
"learning_rate": 0.00010301247352318191,
"loss": 0.1215,
"step": 2790
},
{
"epoch": 0.6588622860168245,
"grad_norm": 0.516445517539978,
"learning_rate": 0.00010230642504118615,
"loss": 0.111,
"step": 2800
},
{
"epoch": 0.6612153656097417,
"grad_norm": 0.25264236330986023,
"learning_rate": 0.0001016003765591904,
"loss": 0.1126,
"step": 2810
},
{
"epoch": 0.663568445202659,
"grad_norm": 0.7910987138748169,
"learning_rate": 0.00010089432807719463,
"loss": 0.1306,
"step": 2820
},
{
"epoch": 0.6659215247955762,
"grad_norm": 0.7823461890220642,
"learning_rate": 0.00010018827959519886,
"loss": 0.0967,
"step": 2830
},
{
"epoch": 0.6682746043884934,
"grad_norm": 0.7126127481460571,
"learning_rate": 9.94822311132031e-05,
"loss": 0.1296,
"step": 2840
},
{
"epoch": 0.6706276839814107,
"grad_norm": 0.9327739477157593,
"learning_rate": 9.877618263120733e-05,
"loss": 0.1115,
"step": 2850
},
{
"epoch": 0.6729807635743279,
"grad_norm": 0.7680268883705139,
"learning_rate": 9.807013414921157e-05,
"loss": 0.1055,
"step": 2860
},
{
"epoch": 0.6753338431672451,
"grad_norm": 0.7711540460586548,
"learning_rate": 9.73640856672158e-05,
"loss": 0.0951,
"step": 2870
},
{
"epoch": 0.6776869227601624,
"grad_norm": 0.5041959881782532,
"learning_rate": 9.665803718522004e-05,
"loss": 0.1087,
"step": 2880
},
{
"epoch": 0.6800400023530796,
"grad_norm": 0.5102591514587402,
"learning_rate": 9.595198870322428e-05,
"loss": 0.1176,
"step": 2890
},
{
"epoch": 0.6823930819459968,
"grad_norm": 0.7100384831428528,
"learning_rate": 9.524594022122851e-05,
"loss": 0.1091,
"step": 2900
},
{
"epoch": 0.6847461615389141,
"grad_norm": 0.6806867122650146,
"learning_rate": 9.453989173923276e-05,
"loss": 0.1251,
"step": 2910
},
{
"epoch": 0.6870992411318313,
"grad_norm": 0.6659530401229858,
"learning_rate": 9.3833843257237e-05,
"loss": 0.0835,
"step": 2920
},
{
"epoch": 0.6894523207247485,
"grad_norm": 0.4317012429237366,
"learning_rate": 9.312779477524123e-05,
"loss": 0.0893,
"step": 2930
},
{
"epoch": 0.6918054003176658,
"grad_norm": 0.5916824340820312,
"learning_rate": 9.242174629324546e-05,
"loss": 0.0905,
"step": 2940
},
{
"epoch": 0.694158479910583,
"grad_norm": 0.7429795265197754,
"learning_rate": 9.17156978112497e-05,
"loss": 0.1063,
"step": 2950
},
{
"epoch": 0.6965115595035002,
"grad_norm": 0.87420254945755,
"learning_rate": 9.100964932925394e-05,
"loss": 0.1042,
"step": 2960
},
{
"epoch": 0.6988646390964174,
"grad_norm": 0.49567267298698425,
"learning_rate": 9.030360084725817e-05,
"loss": 0.1045,
"step": 2970
},
{
"epoch": 0.7012177186893347,
"grad_norm": 2.633138418197632,
"learning_rate": 8.95975523652624e-05,
"loss": 0.0884,
"step": 2980
},
{
"epoch": 0.7035707982822519,
"grad_norm": 0.33752286434173584,
"learning_rate": 8.889150388326664e-05,
"loss": 0.0848,
"step": 2990
},
{
"epoch": 0.7059238778751691,
"grad_norm": 0.5974826812744141,
"learning_rate": 8.818545540127088e-05,
"loss": 0.0971,
"step": 3000
},
{
"epoch": 0.7082769574680864,
"grad_norm": 0.43427976965904236,
"learning_rate": 8.747940691927512e-05,
"loss": 0.1165,
"step": 3010
},
{
"epoch": 0.7106300370610036,
"grad_norm": 0.7770646810531616,
"learning_rate": 8.677335843727936e-05,
"loss": 0.1084,
"step": 3020
},
{
"epoch": 0.7129831166539208,
"grad_norm": 0.5276495218276978,
"learning_rate": 8.60673099552836e-05,
"loss": 0.111,
"step": 3030
},
{
"epoch": 0.7153361962468381,
"grad_norm": 0.9737383127212524,
"learning_rate": 8.536126147328783e-05,
"loss": 0.0972,
"step": 3040
},
{
"epoch": 0.7176892758397553,
"grad_norm": 0.36562997102737427,
"learning_rate": 8.465521299129207e-05,
"loss": 0.093,
"step": 3050
},
{
"epoch": 0.7200423554326725,
"grad_norm": 0.8244528770446777,
"learning_rate": 8.39491645092963e-05,
"loss": 0.1263,
"step": 3060
},
{
"epoch": 0.7223954350255898,
"grad_norm": 1.9532008171081543,
"learning_rate": 8.324311602730054e-05,
"loss": 0.1251,
"step": 3070
},
{
"epoch": 0.724748514618507,
"grad_norm": 0.572896420955658,
"learning_rate": 8.253706754530477e-05,
"loss": 0.0875,
"step": 3080
},
{
"epoch": 0.7271015942114242,
"grad_norm": 1.2975929975509644,
"learning_rate": 8.1831019063309e-05,
"loss": 0.1023,
"step": 3090
},
{
"epoch": 0.7294546738043415,
"grad_norm": 0.5758102536201477,
"learning_rate": 8.112497058131324e-05,
"loss": 0.1019,
"step": 3100
},
{
"epoch": 0.7318077533972587,
"grad_norm": 0.553327202796936,
"learning_rate": 8.041892209931749e-05,
"loss": 0.1128,
"step": 3110
},
{
"epoch": 0.7341608329901759,
"grad_norm": 0.5465438961982727,
"learning_rate": 7.971287361732173e-05,
"loss": 0.1237,
"step": 3120
},
{
"epoch": 0.7365139125830932,
"grad_norm": 0.46917715668678284,
"learning_rate": 7.900682513532596e-05,
"loss": 0.0964,
"step": 3130
},
{
"epoch": 0.7388669921760104,
"grad_norm": 0.8454899787902832,
"learning_rate": 7.83007766533302e-05,
"loss": 0.0901,
"step": 3140
},
{
"epoch": 0.7412200717689276,
"grad_norm": 0.8698781728744507,
"learning_rate": 7.759472817133443e-05,
"loss": 0.1119,
"step": 3150
},
{
"epoch": 0.7435731513618448,
"grad_norm": 1.7399003505706787,
"learning_rate": 7.688867968933867e-05,
"loss": 0.1122,
"step": 3160
},
{
"epoch": 0.7459262309547621,
"grad_norm": 0.4506986141204834,
"learning_rate": 7.61826312073429e-05,
"loss": 0.09,
"step": 3170
},
{
"epoch": 0.7482793105476793,
"grad_norm": 0.7856936454772949,
"learning_rate": 7.547658272534714e-05,
"loss": 0.1194,
"step": 3180
},
{
"epoch": 0.7506323901405965,
"grad_norm": 0.5778619050979614,
"learning_rate": 7.477053424335137e-05,
"loss": 0.1167,
"step": 3190
},
{
"epoch": 0.7529854697335138,
"grad_norm": 0.4940952658653259,
"learning_rate": 7.40644857613556e-05,
"loss": 0.0813,
"step": 3200
},
{
"epoch": 0.755338549326431,
"grad_norm": 1.1496696472167969,
"learning_rate": 7.335843727935984e-05,
"loss": 0.0966,
"step": 3210
},
{
"epoch": 0.7576916289193482,
"grad_norm": 0.4551859498023987,
"learning_rate": 7.265238879736408e-05,
"loss": 0.0956,
"step": 3220
},
{
"epoch": 0.7600447085122655,
"grad_norm": 0.5476594567298889,
"learning_rate": 7.194634031536831e-05,
"loss": 0.1225,
"step": 3230
},
{
"epoch": 0.7623977881051827,
"grad_norm": 0.4413054287433624,
"learning_rate": 7.124029183337255e-05,
"loss": 0.0996,
"step": 3240
},
{
"epoch": 0.7647508676980999,
"grad_norm": 0.6522489190101624,
"learning_rate": 7.053424335137678e-05,
"loss": 0.0946,
"step": 3250
},
{
"epoch": 0.7671039472910172,
"grad_norm": 0.4750779867172241,
"learning_rate": 6.982819486938102e-05,
"loss": 0.0788,
"step": 3260
},
{
"epoch": 0.7694570268839344,
"grad_norm": 0.336505264043808,
"learning_rate": 6.912214638738527e-05,
"loss": 0.0841,
"step": 3270
},
{
"epoch": 0.7718101064768516,
"grad_norm": 1.4274874925613403,
"learning_rate": 6.84160979053895e-05,
"loss": 0.0991,
"step": 3280
},
{
"epoch": 0.7741631860697689,
"grad_norm": 0.6464115977287292,
"learning_rate": 6.771004942339374e-05,
"loss": 0.1172,
"step": 3290
},
{
"epoch": 0.7765162656626861,
"grad_norm": 0.35535725951194763,
"learning_rate": 6.700400094139797e-05,
"loss": 0.09,
"step": 3300
},
{
"epoch": 0.7788693452556033,
"grad_norm": 0.22626227140426636,
"learning_rate": 6.62979524594022e-05,
"loss": 0.089,
"step": 3310
},
{
"epoch": 0.7812224248485204,
"grad_norm": 0.6091925501823425,
"learning_rate": 6.559190397740644e-05,
"loss": 0.0851,
"step": 3320
},
{
"epoch": 0.7835755044414378,
"grad_norm": 2.3381729125976562,
"learning_rate": 6.488585549541068e-05,
"loss": 0.1001,
"step": 3330
},
{
"epoch": 0.785928584034355,
"grad_norm": 0.41597291827201843,
"learning_rate": 6.417980701341491e-05,
"loss": 0.0985,
"step": 3340
},
{
"epoch": 0.7882816636272721,
"grad_norm": 0.6187950372695923,
"learning_rate": 6.347375853141915e-05,
"loss": 0.0877,
"step": 3350
},
{
"epoch": 0.7906347432201895,
"grad_norm": 0.4807620942592621,
"learning_rate": 6.276771004942338e-05,
"loss": 0.1074,
"step": 3360
},
{
"epoch": 0.7929878228131066,
"grad_norm": 0.2998965382575989,
"learning_rate": 6.206166156742763e-05,
"loss": 0.1044,
"step": 3370
},
{
"epoch": 0.7953409024060238,
"grad_norm": 0.5904129147529602,
"learning_rate": 6.135561308543187e-05,
"loss": 0.1073,
"step": 3380
},
{
"epoch": 0.7976939819989411,
"grad_norm": 0.6356788277626038,
"learning_rate": 6.06495646034361e-05,
"loss": 0.1121,
"step": 3390
},
{
"epoch": 0.8000470615918583,
"grad_norm": 0.9147433638572693,
"learning_rate": 5.994351612144034e-05,
"loss": 0.1103,
"step": 3400
},
{
"epoch": 0.8024001411847755,
"grad_norm": 0.8032605051994324,
"learning_rate": 5.923746763944457e-05,
"loss": 0.0965,
"step": 3410
},
{
"epoch": 0.8047532207776928,
"grad_norm": 0.7935906052589417,
"learning_rate": 5.853141915744881e-05,
"loss": 0.1007,
"step": 3420
},
{
"epoch": 0.80710630037061,
"grad_norm": 0.4112412631511688,
"learning_rate": 5.782537067545304e-05,
"loss": 0.0882,
"step": 3430
},
{
"epoch": 0.8094593799635272,
"grad_norm": 0.8190514445304871,
"learning_rate": 5.7119322193457284e-05,
"loss": 0.1019,
"step": 3440
},
{
"epoch": 0.8118124595564445,
"grad_norm": 0.6029698848724365,
"learning_rate": 5.641327371146152e-05,
"loss": 0.1053,
"step": 3450
},
{
"epoch": 0.8141655391493617,
"grad_norm": 0.43347781896591187,
"learning_rate": 5.5707225229465755e-05,
"loss": 0.1044,
"step": 3460
},
{
"epoch": 0.8165186187422789,
"grad_norm": 1.5235440731048584,
"learning_rate": 5.500117674746999e-05,
"loss": 0.0982,
"step": 3470
},
{
"epoch": 0.8188716983351962,
"grad_norm": 0.5716174244880676,
"learning_rate": 5.4295128265474225e-05,
"loss": 0.1078,
"step": 3480
},
{
"epoch": 0.8212247779281134,
"grad_norm": 1.5008090734481812,
"learning_rate": 5.358907978347847e-05,
"loss": 0.0915,
"step": 3490
},
{
"epoch": 0.8235778575210306,
"grad_norm": 0.49782001972198486,
"learning_rate": 5.28830313014827e-05,
"loss": 0.0892,
"step": 3500
},
{
"epoch": 0.8259309371139478,
"grad_norm": 0.4466950297355652,
"learning_rate": 5.217698281948694e-05,
"loss": 0.0905,
"step": 3510
},
{
"epoch": 0.8282840167068651,
"grad_norm": 0.5504721403121948,
"learning_rate": 5.147093433749117e-05,
"loss": 0.1128,
"step": 3520
},
{
"epoch": 0.8306370962997823,
"grad_norm": 0.4870951473712921,
"learning_rate": 5.076488585549541e-05,
"loss": 0.0876,
"step": 3530
},
{
"epoch": 0.8329901758926995,
"grad_norm": 0.6789172887802124,
"learning_rate": 5.005883737349965e-05,
"loss": 0.1004,
"step": 3540
},
{
"epoch": 0.8353432554856168,
"grad_norm": 0.5021870136260986,
"learning_rate": 4.935278889150388e-05,
"loss": 0.0862,
"step": 3550
},
{
"epoch": 0.837696335078534,
"grad_norm": 0.5829181671142578,
"learning_rate": 4.864674040950811e-05,
"loss": 0.0994,
"step": 3560
},
{
"epoch": 0.8400494146714512,
"grad_norm": 1.029181957244873,
"learning_rate": 4.794069192751235e-05,
"loss": 0.1144,
"step": 3570
},
{
"epoch": 0.8424024942643685,
"grad_norm": 0.6730376482009888,
"learning_rate": 4.723464344551658e-05,
"loss": 0.106,
"step": 3580
},
{
"epoch": 0.8447555738572857,
"grad_norm": 0.6129499673843384,
"learning_rate": 4.652859496352082e-05,
"loss": 0.0896,
"step": 3590
},
{
"epoch": 0.8471086534502029,
"grad_norm": 0.422830194234848,
"learning_rate": 4.582254648152506e-05,
"loss": 0.0846,
"step": 3600
},
{
"epoch": 0.8494617330431202,
"grad_norm": 0.5306664109230042,
"learning_rate": 4.5116497999529296e-05,
"loss": 0.1059,
"step": 3610
},
{
"epoch": 0.8518148126360374,
"grad_norm": 0.6436883807182312,
"learning_rate": 4.441044951753353e-05,
"loss": 0.1132,
"step": 3620
},
{
"epoch": 0.8541678922289546,
"grad_norm": 0.4121890962123871,
"learning_rate": 4.3704401035537766e-05,
"loss": 0.0864,
"step": 3630
},
{
"epoch": 0.8565209718218719,
"grad_norm": 0.42521169781684875,
"learning_rate": 4.2998352553542e-05,
"loss": 0.1011,
"step": 3640
},
{
"epoch": 0.8588740514147891,
"grad_norm": 0.49623095989227295,
"learning_rate": 4.229230407154624e-05,
"loss": 0.1093,
"step": 3650
},
{
"epoch": 0.8612271310077063,
"grad_norm": 0.5516742467880249,
"learning_rate": 4.158625558955048e-05,
"loss": 0.1009,
"step": 3660
},
{
"epoch": 0.8635802106006236,
"grad_norm": 0.37128451466560364,
"learning_rate": 4.0880207107554713e-05,
"loss": 0.0717,
"step": 3670
},
{
"epoch": 0.8659332901935408,
"grad_norm": 0.3802624046802521,
"learning_rate": 4.017415862555895e-05,
"loss": 0.0891,
"step": 3680
},
{
"epoch": 0.868286369786458,
"grad_norm": 0.35558944940567017,
"learning_rate": 3.9468110143563184e-05,
"loss": 0.0863,
"step": 3690
},
{
"epoch": 0.8706394493793752,
"grad_norm": 0.2548139989376068,
"learning_rate": 3.8762061661567426e-05,
"loss": 0.0946,
"step": 3700
},
{
"epoch": 0.8729925289722925,
"grad_norm": 0.3489900827407837,
"learning_rate": 3.805601317957166e-05,
"loss": 0.0794,
"step": 3710
},
{
"epoch": 0.8753456085652097,
"grad_norm": 0.7514833807945251,
"learning_rate": 3.7349964697575896e-05,
"loss": 0.1026,
"step": 3720
},
{
"epoch": 0.8776986881581269,
"grad_norm": 0.28846803307533264,
"learning_rate": 3.664391621558013e-05,
"loss": 0.107,
"step": 3730
},
{
"epoch": 0.8800517677510442,
"grad_norm": 0.3054257333278656,
"learning_rate": 3.5937867733584366e-05,
"loss": 0.0839,
"step": 3740
},
{
"epoch": 0.8824048473439614,
"grad_norm": 0.487393856048584,
"learning_rate": 3.523181925158861e-05,
"loss": 0.099,
"step": 3750
},
{
"epoch": 0.8847579269368786,
"grad_norm": 0.7874276041984558,
"learning_rate": 3.4525770769592843e-05,
"loss": 0.0873,
"step": 3760
},
{
"epoch": 0.8871110065297959,
"grad_norm": 0.3583498001098633,
"learning_rate": 3.381972228759708e-05,
"loss": 0.0854,
"step": 3770
},
{
"epoch": 0.8894640861227131,
"grad_norm": 0.5606823563575745,
"learning_rate": 3.3113673805601314e-05,
"loss": 0.1106,
"step": 3780
},
{
"epoch": 0.8918171657156303,
"grad_norm": 0.48208296298980713,
"learning_rate": 3.240762532360555e-05,
"loss": 0.1138,
"step": 3790
},
{
"epoch": 0.8941702453085476,
"grad_norm": 1.026995301246643,
"learning_rate": 3.170157684160979e-05,
"loss": 0.0877,
"step": 3800
},
{
"epoch": 0.8965233249014648,
"grad_norm": 0.7940952777862549,
"learning_rate": 3.0995528359614026e-05,
"loss": 0.069,
"step": 3810
},
{
"epoch": 0.898876404494382,
"grad_norm": 0.7711090445518494,
"learning_rate": 3.028947987761826e-05,
"loss": 0.0884,
"step": 3820
},
{
"epoch": 0.9012294840872993,
"grad_norm": 0.6985650062561035,
"learning_rate": 2.9583431395622496e-05,
"loss": 0.0929,
"step": 3830
},
{
"epoch": 0.9035825636802165,
"grad_norm": 0.5291894674301147,
"learning_rate": 2.8877382913626735e-05,
"loss": 0.1166,
"step": 3840
},
{
"epoch": 0.9059356432731337,
"grad_norm": 0.3929837644100189,
"learning_rate": 2.817133443163097e-05,
"loss": 0.0904,
"step": 3850
},
{
"epoch": 0.9082887228660509,
"grad_norm": 0.492017537355423,
"learning_rate": 2.746528594963521e-05,
"loss": 0.0986,
"step": 3860
},
{
"epoch": 0.9106418024589682,
"grad_norm": 0.5756918787956238,
"learning_rate": 2.6759237467639444e-05,
"loss": 0.0897,
"step": 3870
},
{
"epoch": 0.9129948820518854,
"grad_norm": 0.5781024098396301,
"learning_rate": 2.605318898564368e-05,
"loss": 0.0799,
"step": 3880
},
{
"epoch": 0.9153479616448026,
"grad_norm": 0.28270334005355835,
"learning_rate": 2.5347140503647918e-05,
"loss": 0.0889,
"step": 3890
},
{
"epoch": 0.9177010412377199,
"grad_norm": 0.5788043737411499,
"learning_rate": 2.464109202165215e-05,
"loss": 0.0788,
"step": 3900
},
{
"epoch": 0.9200541208306371,
"grad_norm": 0.563836932182312,
"learning_rate": 2.3935043539656384e-05,
"loss": 0.0914,
"step": 3910
},
{
"epoch": 0.9224072004235543,
"grad_norm": 0.4077290892601013,
"learning_rate": 2.3228995057660623e-05,
"loss": 0.1057,
"step": 3920
},
{
"epoch": 0.9247602800164716,
"grad_norm": 0.6209468841552734,
"learning_rate": 2.2522946575664858e-05,
"loss": 0.0812,
"step": 3930
},
{
"epoch": 0.9271133596093888,
"grad_norm": 0.542506754398346,
"learning_rate": 2.1816898093669097e-05,
"loss": 0.0951,
"step": 3940
},
{
"epoch": 0.929466439202306,
"grad_norm": 0.5754973292350769,
"learning_rate": 2.1110849611673332e-05,
"loss": 0.1023,
"step": 3950
},
{
"epoch": 0.9318195187952233,
"grad_norm": 0.3798030912876129,
"learning_rate": 2.0404801129677567e-05,
"loss": 0.0997,
"step": 3960
},
{
"epoch": 0.9341725983881405,
"grad_norm": 0.6593634486198425,
"learning_rate": 1.9698752647681806e-05,
"loss": 0.1056,
"step": 3970
},
{
"epoch": 0.9365256779810577,
"grad_norm": 0.46481505036354065,
"learning_rate": 1.899270416568604e-05,
"loss": 0.0795,
"step": 3980
},
{
"epoch": 0.938878757573975,
"grad_norm": 0.5140686631202698,
"learning_rate": 1.828665568369028e-05,
"loss": 0.0809,
"step": 3990
},
{
"epoch": 0.9412318371668922,
"grad_norm": 0.8201892375946045,
"learning_rate": 1.7580607201694515e-05,
"loss": 0.0851,
"step": 4000
},
{
"epoch": 0.9435849167598094,
"grad_norm": 0.3848946690559387,
"learning_rate": 1.687455871969875e-05,
"loss": 0.0789,
"step": 4010
},
{
"epoch": 0.9459379963527267,
"grad_norm": 0.3362932503223419,
"learning_rate": 1.6168510237702988e-05,
"loss": 0.0841,
"step": 4020
},
{
"epoch": 0.9482910759456439,
"grad_norm": 0.400037556886673,
"learning_rate": 1.5462461755707223e-05,
"loss": 0.1004,
"step": 4030
},
{
"epoch": 0.9506441555385611,
"grad_norm": 0.6505069136619568,
"learning_rate": 1.475641327371146e-05,
"loss": 0.0977,
"step": 4040
},
{
"epoch": 0.9529972351314783,
"grad_norm": 0.710784375667572,
"learning_rate": 1.4050364791715697e-05,
"loss": 0.085,
"step": 4050
},
{
"epoch": 0.9553503147243956,
"grad_norm": 0.4263714849948883,
"learning_rate": 1.3344316309719934e-05,
"loss": 0.1044,
"step": 4060
},
{
"epoch": 0.9577033943173128,
"grad_norm": 0.42400240898132324,
"learning_rate": 1.2638267827724171e-05,
"loss": 0.1113,
"step": 4070
},
{
"epoch": 0.96005647391023,
"grad_norm": 0.2722209393978119,
"learning_rate": 1.1932219345728404e-05,
"loss": 0.0792,
"step": 4080
},
{
"epoch": 0.9624095535031473,
"grad_norm": 0.9779515862464905,
"learning_rate": 1.1226170863732641e-05,
"loss": 0.1044,
"step": 4090
},
{
"epoch": 0.9647626330960645,
"grad_norm": 1.028387188911438,
"learning_rate": 1.0520122381736878e-05,
"loss": 0.1043,
"step": 4100
},
{
"epoch": 0.9671157126889817,
"grad_norm": 0.5009176135063171,
"learning_rate": 9.814073899741115e-06,
"loss": 0.087,
"step": 4110
},
{
"epoch": 0.969468792281899,
"grad_norm": 0.33020302653312683,
"learning_rate": 9.10802541774535e-06,
"loss": 0.092,
"step": 4120
},
{
"epoch": 0.9718218718748162,
"grad_norm": 0.4314991533756256,
"learning_rate": 8.401976935749587e-06,
"loss": 0.075,
"step": 4130
},
{
"epoch": 0.9741749514677334,
"grad_norm": 0.6121822595596313,
"learning_rate": 7.695928453753824e-06,
"loss": 0.0892,
"step": 4140
},
{
"epoch": 0.9765280310606507,
"grad_norm": 0.3374115824699402,
"learning_rate": 6.989879971758061e-06,
"loss": 0.0755,
"step": 4150
},
{
"epoch": 0.9788811106535679,
"grad_norm": 0.5865825414657593,
"learning_rate": 6.283831489762297e-06,
"loss": 0.0957,
"step": 4160
},
{
"epoch": 0.9812341902464851,
"grad_norm": 0.2131696194410324,
"learning_rate": 5.577783007766533e-06,
"loss": 0.0849,
"step": 4170
},
{
"epoch": 0.9835872698394024,
"grad_norm": 1.3489303588867188,
"learning_rate": 4.871734525770769e-06,
"loss": 0.098,
"step": 4180
},
{
"epoch": 0.9859403494323196,
"grad_norm": 0.15470068156719208,
"learning_rate": 4.1656860437750056e-06,
"loss": 0.0847,
"step": 4190
},
{
"epoch": 0.9882934290252368,
"grad_norm": 0.8059414625167847,
"learning_rate": 3.459637561779242e-06,
"loss": 0.1094,
"step": 4200
},
{
"epoch": 0.990646508618154,
"grad_norm": 0.6808902621269226,
"learning_rate": 2.753589079783478e-06,
"loss": 0.0867,
"step": 4210
},
{
"epoch": 0.9929995882110713,
"grad_norm": 0.29802441596984863,
"learning_rate": 2.0475405977877145e-06,
"loss": 0.0866,
"step": 4220
},
{
"epoch": 0.9953526678039885,
"grad_norm": 0.5227815508842468,
"learning_rate": 1.341492115791951e-06,
"loss": 0.0758,
"step": 4230
},
{
"epoch": 0.9977057473969057,
"grad_norm": 0.25185248255729675,
"learning_rate": 6.354436337961872e-07,
"loss": 0.0906,
"step": 4240
},
{
"epoch": 0.9998235190305312,
"step": 4249,
"total_flos": 1.037665224400896e+16,
"train_loss": 0.1519955188758403,
"train_runtime": 5739.131,
"train_samples_per_second": 11.848,
"train_steps_per_second": 0.74
}
],
"logging_steps": 10,
"max_steps": 4249,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"total_flos": 1.037665224400896e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}
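
A minimal sketch (not part of the checkpoint itself) of how the trainer state above could be inspected. It assumes Python 3 with matplotlib installed and that the file is saved locally as trainer_state.json; the field names ("log_history", "step", "loss", "learning_rate") follow the structure shown above, and the path is an assumption to adjust as needed.

# Sketch: load trainer_state.json and plot the loss curve and learning-rate schedule.
# Assumes the file sits in the current directory; adjust the path for your setup.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json", "r", encoding="utf-8") as f:
    state = json.load(f)

# Keep only the per-step log entries; the final summary entry has no "loss" key.
logs = [entry for entry in state["log_history"] if "loss" in entry]
steps = [entry["step"] for entry in logs]
losses = [entry["loss"] for entry in logs]
lrs = [entry["learning_rate"] for entry in logs]

fig, (ax_loss, ax_lr) = plt.subplots(2, 1, sharex=True, figsize=(8, 6))
ax_loss.plot(steps, losses)
ax_loss.set_ylabel("training loss")
ax_lr.plot(steps, lrs)
ax_lr.set_ylabel("learning rate")
ax_lr.set_xlabel("global step")
fig.tight_layout()
plt.show()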