{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9998235190305312, "eval_steps": 500, "global_step": 4249, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0023530795929172306, "grad_norm": 4.500667572021484, "learning_rate": 0.0002992939515180042, "loss": 2.9988, "step": 10 }, { "epoch": 0.004706159185834461, "grad_norm": 2.7827706336975098, "learning_rate": 0.00029858790303600844, "loss": 1.0643, "step": 20 }, { "epoch": 0.007059238778751691, "grad_norm": 2.746577739715576, "learning_rate": 0.0002978818545540127, "loss": 0.7978, "step": 30 }, { "epoch": 0.009412318371668922, "grad_norm": 2.567692279815674, "learning_rate": 0.0002971758060720169, "loss": 0.6546, "step": 40 }, { "epoch": 0.011765397964586153, "grad_norm": 2.2394003868103027, "learning_rate": 0.00029646975759002115, "loss": 0.5361, "step": 50 }, { "epoch": 0.014118477557503383, "grad_norm": 2.1666100025177, "learning_rate": 0.0002957637091080254, "loss": 0.5137, "step": 60 }, { "epoch": 0.01647155715042061, "grad_norm": 1.922058343887329, "learning_rate": 0.0002950576606260296, "loss": 0.446, "step": 70 }, { "epoch": 0.018824636743337845, "grad_norm": 1.732611060142517, "learning_rate": 0.00029435161214403386, "loss": 0.4405, "step": 80 }, { "epoch": 0.021177716336255075, "grad_norm": 2.2046239376068115, "learning_rate": 0.0002936455636620381, "loss": 0.3959, "step": 90 }, { "epoch": 0.023530795929172305, "grad_norm": 2.083113670349121, "learning_rate": 0.0002929395151800423, "loss": 0.4114, "step": 100 }, { "epoch": 0.025883875522089535, "grad_norm": 1.671247124671936, "learning_rate": 0.00029223346669804656, "loss": 0.3554, "step": 110 }, { "epoch": 0.028236955115006766, "grad_norm": 2.001924514770508, "learning_rate": 0.0002915274182160508, "loss": 0.3577, "step": 120 }, { "epoch": 0.030590034707923996, "grad_norm": 2.07259202003479, "learning_rate": 0.00029082136973405503, "loss": 0.3422, "step": 130 }, { "epoch": 0.03294311430084122, "grad_norm": 1.7239247560501099, "learning_rate": 0.00029011532125205927, "loss": 0.3079, "step": 140 }, { "epoch": 0.03529619389375846, "grad_norm": 1.7430157661437988, "learning_rate": 0.0002894092727700635, "loss": 0.3304, "step": 150 }, { "epoch": 0.03764927348667569, "grad_norm": 1.1152617931365967, "learning_rate": 0.00028870322428806774, "loss": 0.3009, "step": 160 }, { "epoch": 0.04000235307959292, "grad_norm": 1.5272759199142456, "learning_rate": 0.00028799717580607197, "loss": 0.3027, "step": 170 }, { "epoch": 0.04235543267251015, "grad_norm": 1.3934285640716553, "learning_rate": 0.0002872911273240762, "loss": 0.2514, "step": 180 }, { "epoch": 0.04470851226542738, "grad_norm": 1.7138372659683228, "learning_rate": 0.00028658507884208044, "loss": 0.2556, "step": 190 }, { "epoch": 0.04706159185834461, "grad_norm": 1.7979109287261963, "learning_rate": 0.00028587903036008473, "loss": 0.2696, "step": 200 }, { "epoch": 0.04941467145126184, "grad_norm": 1.342785358428955, "learning_rate": 0.0002851729818780889, "loss": 0.2496, "step": 210 }, { "epoch": 0.05176775104417907, "grad_norm": 1.5516395568847656, "learning_rate": 0.0002844669333960932, "loss": 0.2727, "step": 220 }, { "epoch": 0.0541208306370963, "grad_norm": 6.922358989715576, "learning_rate": 0.0002837608849140974, "loss": 0.2492, "step": 230 }, { "epoch": 0.05647391023001353, "grad_norm": 1.5551228523254395, "learning_rate": 0.00028305483643210167, "loss": 0.2451, "step": 240 }, { "epoch": 0.05882698982293076, "grad_norm": 
1.300445318222046, "learning_rate": 0.00028234878795010585, "loss": 0.253, "step": 250 }, { "epoch": 0.06118006941584799, "grad_norm": 1.4355467557907104, "learning_rate": 0.00028164273946811014, "loss": 0.2453, "step": 260 }, { "epoch": 0.06353314900876522, "grad_norm": 15.9704008102417, "learning_rate": 0.0002809366909861143, "loss": 0.2072, "step": 270 }, { "epoch": 0.06588622860168244, "grad_norm": 1.7124171257019043, "learning_rate": 0.0002802306425041186, "loss": 0.4367, "step": 280 }, { "epoch": 0.06823930819459968, "grad_norm": 1.6787582635879517, "learning_rate": 0.0002795245940221228, "loss": 0.2716, "step": 290 }, { "epoch": 0.07059238778751692, "grad_norm": 1.2618638277053833, "learning_rate": 0.0002788185455401271, "loss": 0.2821, "step": 300 }, { "epoch": 0.07294546738043414, "grad_norm": 2.927347421646118, "learning_rate": 0.00027811249705813126, "loss": 0.2712, "step": 310 }, { "epoch": 0.07529854697335138, "grad_norm": 1.9304898977279663, "learning_rate": 0.00027740644857613555, "loss": 0.2637, "step": 320 }, { "epoch": 0.0776516265662686, "grad_norm": 1.2599807977676392, "learning_rate": 0.0002767004000941398, "loss": 0.2389, "step": 330 }, { "epoch": 0.08000470615918584, "grad_norm": 1.4264953136444092, "learning_rate": 0.000275994351612144, "loss": 0.2143, "step": 340 }, { "epoch": 0.08235778575210306, "grad_norm": 1.4229093790054321, "learning_rate": 0.00027528830313014826, "loss": 0.2571, "step": 350 }, { "epoch": 0.0847108653450203, "grad_norm": 1.743034839630127, "learning_rate": 0.0002745822546481525, "loss": 0.2292, "step": 360 }, { "epoch": 0.08706394493793752, "grad_norm": 1.3582898378372192, "learning_rate": 0.00027387620616615673, "loss": 0.2314, "step": 370 }, { "epoch": 0.08941702453085476, "grad_norm": 1.2714539766311646, "learning_rate": 0.00027317015768416096, "loss": 0.2694, "step": 380 }, { "epoch": 0.09177010412377198, "grad_norm": 1.0213568210601807, "learning_rate": 0.0002724641092021652, "loss": 0.2269, "step": 390 }, { "epoch": 0.09412318371668922, "grad_norm": 0.8783596754074097, "learning_rate": 0.00027175806072016943, "loss": 0.2488, "step": 400 }, { "epoch": 0.09647626330960644, "grad_norm": 1.842328667640686, "learning_rate": 0.00027105201223817367, "loss": 0.2175, "step": 410 }, { "epoch": 0.09882934290252368, "grad_norm": 1.4185247421264648, "learning_rate": 0.0002703459637561779, "loss": 0.2049, "step": 420 }, { "epoch": 0.1011824224954409, "grad_norm": 1.3057924509048462, "learning_rate": 0.00026963991527418214, "loss": 0.1819, "step": 430 }, { "epoch": 0.10353550208835814, "grad_norm": 1.563916802406311, "learning_rate": 0.0002689338667921864, "loss": 0.2042, "step": 440 }, { "epoch": 0.10588858168127536, "grad_norm": 0.9588648080825806, "learning_rate": 0.0002682278183101906, "loss": 0.1977, "step": 450 }, { "epoch": 0.1082416612741926, "grad_norm": 1.3258203268051147, "learning_rate": 0.00026752176982819485, "loss": 0.1984, "step": 460 }, { "epoch": 0.11059474086710983, "grad_norm": 1.6783477067947388, "learning_rate": 0.0002668157213461991, "loss": 0.2086, "step": 470 }, { "epoch": 0.11294782046002706, "grad_norm": 1.820469617843628, "learning_rate": 0.0002661096728642033, "loss": 0.2128, "step": 480 }, { "epoch": 0.11530090005294429, "grad_norm": 1.1493395566940308, "learning_rate": 0.00026540362438220755, "loss": 0.2022, "step": 490 }, { "epoch": 0.11765397964586152, "grad_norm": 1.3134245872497559, "learning_rate": 0.0002646975759002118, "loss": 0.2199, "step": 500 }, { "epoch": 0.12000705923877875, "grad_norm": 4.0345988273620605, 
"learning_rate": 0.000263991527418216, "loss": 0.2181, "step": 510 }, { "epoch": 0.12236013883169598, "grad_norm": 1.7995352745056152, "learning_rate": 0.00026328547893622026, "loss": 0.2233, "step": 520 }, { "epoch": 0.1247132184246132, "grad_norm": 1.6303889751434326, "learning_rate": 0.0002625794304542245, "loss": 0.2161, "step": 530 }, { "epoch": 0.12706629801753044, "grad_norm": 1.1785417795181274, "learning_rate": 0.0002618733819722287, "loss": 0.1952, "step": 540 }, { "epoch": 0.12941937761044767, "grad_norm": 2.5084645748138428, "learning_rate": 0.00026116733349023296, "loss": 0.204, "step": 550 }, { "epoch": 0.1317724572033649, "grad_norm": 0.9784884452819824, "learning_rate": 0.0002604612850082372, "loss": 0.1598, "step": 560 }, { "epoch": 0.13412553679628214, "grad_norm": 0.8020937442779541, "learning_rate": 0.00025975523652624143, "loss": 0.2045, "step": 570 }, { "epoch": 0.13647861638919936, "grad_norm": 0.9151997566223145, "learning_rate": 0.00025904918804424567, "loss": 0.1744, "step": 580 }, { "epoch": 0.1388316959821166, "grad_norm": 1.55955171585083, "learning_rate": 0.0002583431395622499, "loss": 0.1878, "step": 590 }, { "epoch": 0.14118477557503384, "grad_norm": 23.52423858642578, "learning_rate": 0.0002576370910802542, "loss": 0.1779, "step": 600 }, { "epoch": 0.14353785516795106, "grad_norm": 1.1516189575195312, "learning_rate": 0.0002569310425982584, "loss": 0.195, "step": 610 }, { "epoch": 0.14589093476086828, "grad_norm": 1.0912541151046753, "learning_rate": 0.00025622499411626266, "loss": 0.205, "step": 620 }, { "epoch": 0.1482440143537855, "grad_norm": 1.2680310010910034, "learning_rate": 0.00025551894563426684, "loss": 0.1654, "step": 630 }, { "epoch": 0.15059709394670276, "grad_norm": 1.2099932432174683, "learning_rate": 0.00025481289715227113, "loss": 0.1698, "step": 640 }, { "epoch": 0.15295017353961998, "grad_norm": 1.1155511140823364, "learning_rate": 0.0002541068486702753, "loss": 0.1883, "step": 650 }, { "epoch": 0.1553032531325372, "grad_norm": 1.2237110137939453, "learning_rate": 0.0002534008001882796, "loss": 0.1739, "step": 660 }, { "epoch": 0.15765633272545443, "grad_norm": 2.2334392070770264, "learning_rate": 0.0002526947517062838, "loss": 0.2149, "step": 670 }, { "epoch": 0.16000941231837168, "grad_norm": 1.0051536560058594, "learning_rate": 0.0002519887032242881, "loss": 0.1755, "step": 680 }, { "epoch": 0.1623624919112889, "grad_norm": 1.5381518602371216, "learning_rate": 0.00025128265474229225, "loss": 0.1814, "step": 690 }, { "epoch": 0.16471557150420613, "grad_norm": 1.3390990495681763, "learning_rate": 0.00025057660626029654, "loss": 0.1866, "step": 700 }, { "epoch": 0.16706865109712335, "grad_norm": 1.4517531394958496, "learning_rate": 0.0002498705577783007, "loss": 0.1928, "step": 710 }, { "epoch": 0.1694217306900406, "grad_norm": 1.4081028699874878, "learning_rate": 0.000249164509296305, "loss": 0.1714, "step": 720 }, { "epoch": 0.17177481028295782, "grad_norm": 1.357934832572937, "learning_rate": 0.00024845846081430925, "loss": 0.1856, "step": 730 }, { "epoch": 0.17412788987587505, "grad_norm": 1.043090581893921, "learning_rate": 0.0002477524123323135, "loss": 0.1495, "step": 740 }, { "epoch": 0.17648096946879227, "grad_norm": 1.2053163051605225, "learning_rate": 0.0002470463638503177, "loss": 0.1517, "step": 750 }, { "epoch": 0.17883404906170952, "grad_norm": 2.3474409580230713, "learning_rate": 0.00024634031536832195, "loss": 0.1882, "step": 760 }, { "epoch": 0.18118712865462674, "grad_norm": 0.8380926847457886, "learning_rate": 
0.0002456342668863262, "loss": 0.1796, "step": 770 }, { "epoch": 0.18354020824754397, "grad_norm": 1.3997254371643066, "learning_rate": 0.0002449282184043304, "loss": 0.1843, "step": 780 }, { "epoch": 0.1858932878404612, "grad_norm": 1.3143609762191772, "learning_rate": 0.00024422216992233466, "loss": 0.1629, "step": 790 }, { "epoch": 0.18824636743337844, "grad_norm": 0.9414114952087402, "learning_rate": 0.0002435161214403389, "loss": 0.1583, "step": 800 }, { "epoch": 0.19059944702629567, "grad_norm": 1.0523838996887207, "learning_rate": 0.0002428100729583431, "loss": 0.194, "step": 810 }, { "epoch": 0.1929525266192129, "grad_norm": 1.0871750116348267, "learning_rate": 0.00024210402447634737, "loss": 0.15, "step": 820 }, { "epoch": 0.1953056062121301, "grad_norm": 1.515932321548462, "learning_rate": 0.0002413979759943516, "loss": 0.1895, "step": 830 }, { "epoch": 0.19765868580504736, "grad_norm": 0.7211456298828125, "learning_rate": 0.00024069192751235584, "loss": 0.1685, "step": 840 }, { "epoch": 0.20001176539796459, "grad_norm": 0.7664592862129211, "learning_rate": 0.00023998587903036007, "loss": 0.1658, "step": 850 }, { "epoch": 0.2023648449908818, "grad_norm": 0.8728657960891724, "learning_rate": 0.0002392798305483643, "loss": 0.1468, "step": 860 }, { "epoch": 0.20471792458379906, "grad_norm": 1.3027325868606567, "learning_rate": 0.00023857378206636854, "loss": 0.1633, "step": 870 }, { "epoch": 0.20707100417671628, "grad_norm": 1.1061084270477295, "learning_rate": 0.00023786773358437278, "loss": 0.1861, "step": 880 }, { "epoch": 0.2094240837696335, "grad_norm": 1.176365613937378, "learning_rate": 0.000237161685102377, "loss": 0.1589, "step": 890 }, { "epoch": 0.21177716336255073, "grad_norm": 0.8307468295097351, "learning_rate": 0.00023645563662038127, "loss": 0.172, "step": 900 }, { "epoch": 0.21413024295546798, "grad_norm": 1.2759816646575928, "learning_rate": 0.00023574958813838548, "loss": 0.1475, "step": 910 }, { "epoch": 0.2164833225483852, "grad_norm": 1.661071538925171, "learning_rate": 0.00023504353965638974, "loss": 0.2048, "step": 920 }, { "epoch": 0.21883640214130243, "grad_norm": 1.3144210577011108, "learning_rate": 0.00023433749117439395, "loss": 0.1582, "step": 930 }, { "epoch": 0.22118948173421965, "grad_norm": 1.1830146312713623, "learning_rate": 0.00023363144269239821, "loss": 0.1567, "step": 940 }, { "epoch": 0.2235425613271369, "grad_norm": 0.7755473256111145, "learning_rate": 0.00023292539421040242, "loss": 0.1369, "step": 950 }, { "epoch": 0.22589564092005412, "grad_norm": 0.708152711391449, "learning_rate": 0.00023221934572840666, "loss": 0.1477, "step": 960 }, { "epoch": 0.22824872051297135, "grad_norm": 0.9567592144012451, "learning_rate": 0.0002315132972464109, "loss": 0.1685, "step": 970 }, { "epoch": 0.23060180010588857, "grad_norm": 1.019717812538147, "learning_rate": 0.00023080724876441513, "loss": 0.1485, "step": 980 }, { "epoch": 0.23295487969880582, "grad_norm": 3.704050302505493, "learning_rate": 0.00023010120028241936, "loss": 0.182, "step": 990 }, { "epoch": 0.23530795929172305, "grad_norm": 1.3113001585006714, "learning_rate": 0.0002293951518004236, "loss": 0.2033, "step": 1000 }, { "epoch": 0.23766103888464027, "grad_norm": 1.586300253868103, "learning_rate": 0.00022868910331842783, "loss": 0.1658, "step": 1010 }, { "epoch": 0.2400141184775575, "grad_norm": 0.7029755711555481, "learning_rate": 0.00022798305483643207, "loss": 0.1675, "step": 1020 }, { "epoch": 0.24236719807047474, "grad_norm": 0.9558175802230835, "learning_rate": 
0.00022727700635443633, "loss": 0.1292, "step": 1030 }, { "epoch": 0.24472027766339197, "grad_norm": 0.552598774433136, "learning_rate": 0.00022657095787244054, "loss": 0.1271, "step": 1040 }, { "epoch": 0.2470733572563092, "grad_norm": 1.160657525062561, "learning_rate": 0.0002258649093904448, "loss": 0.1406, "step": 1050 }, { "epoch": 0.2494264368492264, "grad_norm": 0.9359754323959351, "learning_rate": 0.000225158860908449, "loss": 0.1456, "step": 1060 }, { "epoch": 0.25177951644214364, "grad_norm": 0.6799198985099792, "learning_rate": 0.00022445281242645327, "loss": 0.1235, "step": 1070 }, { "epoch": 0.2541325960350609, "grad_norm": 0.97700434923172, "learning_rate": 0.00022374676394445748, "loss": 0.1721, "step": 1080 }, { "epoch": 0.25648567562797814, "grad_norm": 0.6762118935585022, "learning_rate": 0.00022304071546246174, "loss": 0.2006, "step": 1090 }, { "epoch": 0.25883875522089533, "grad_norm": 0.6071228384971619, "learning_rate": 0.00022233466698046595, "loss": 0.1647, "step": 1100 }, { "epoch": 0.2611918348138126, "grad_norm": 0.7097590565681458, "learning_rate": 0.0002216286184984702, "loss": 0.1662, "step": 1110 }, { "epoch": 0.2635449144067298, "grad_norm": 0.48786184191703796, "learning_rate": 0.00022092257001647442, "loss": 0.1378, "step": 1120 }, { "epoch": 0.26589799399964703, "grad_norm": 0.7238913178443909, "learning_rate": 0.00022021652153447868, "loss": 0.163, "step": 1130 }, { "epoch": 0.2682510735925643, "grad_norm": 1.3571726083755493, "learning_rate": 0.00021951047305248292, "loss": 0.1545, "step": 1140 }, { "epoch": 0.2706041531854815, "grad_norm": 0.6683372259140015, "learning_rate": 0.00021880442457048715, "loss": 0.1375, "step": 1150 }, { "epoch": 0.27295723277839873, "grad_norm": 1.9159690141677856, "learning_rate": 0.0002180983760884914, "loss": 0.1604, "step": 1160 }, { "epoch": 0.275310312371316, "grad_norm": 1.6136759519577026, "learning_rate": 0.00021739232760649562, "loss": 0.1827, "step": 1170 }, { "epoch": 0.2776633919642332, "grad_norm": 1.2445416450500488, "learning_rate": 0.00021668627912449986, "loss": 0.1283, "step": 1180 }, { "epoch": 0.2800164715571504, "grad_norm": 1.143410563468933, "learning_rate": 0.0002159802306425041, "loss": 0.1571, "step": 1190 }, { "epoch": 0.2823695511500677, "grad_norm": 0.641952633857727, "learning_rate": 0.00021527418216050833, "loss": 0.1511, "step": 1200 }, { "epoch": 0.2847226307429849, "grad_norm": 0.9618122577667236, "learning_rate": 0.00021456813367851256, "loss": 0.1251, "step": 1210 }, { "epoch": 0.2870757103359021, "grad_norm": 1.040390133857727, "learning_rate": 0.0002138620851965168, "loss": 0.1481, "step": 1220 }, { "epoch": 0.2894287899288193, "grad_norm": 2.470360279083252, "learning_rate": 0.00021315603671452106, "loss": 0.1523, "step": 1230 }, { "epoch": 0.29178186952173657, "grad_norm": 1.15378737449646, "learning_rate": 0.00021244998823252527, "loss": 0.1526, "step": 1240 }, { "epoch": 0.2941349491146538, "grad_norm": 1.2236779928207397, "learning_rate": 0.00021174393975052953, "loss": 0.15, "step": 1250 }, { "epoch": 0.296488028707571, "grad_norm": 0.6974225640296936, "learning_rate": 0.00021103789126853374, "loss": 0.1529, "step": 1260 }, { "epoch": 0.29884110830048827, "grad_norm": 1.2019627094268799, "learning_rate": 0.000210331842786538, "loss": 0.1534, "step": 1270 }, { "epoch": 0.3011941878934055, "grad_norm": 1.5245829820632935, "learning_rate": 0.0002096257943045422, "loss": 0.1452, "step": 1280 }, { "epoch": 0.3035472674863227, "grad_norm": 1.5062931776046753, "learning_rate": 
0.00020891974582254647, "loss": 0.1617, "step": 1290 }, { "epoch": 0.30590034707923996, "grad_norm": 0.5989176034927368, "learning_rate": 0.00020821369734055068, "loss": 0.1567, "step": 1300 }, { "epoch": 0.30825342667215716, "grad_norm": 1.1063286066055298, "learning_rate": 0.00020750764885855494, "loss": 0.1651, "step": 1310 }, { "epoch": 0.3106065062650744, "grad_norm": 0.9815717935562134, "learning_rate": 0.00020680160037655915, "loss": 0.1485, "step": 1320 }, { "epoch": 0.31295958585799166, "grad_norm": 1.218807578086853, "learning_rate": 0.0002060955518945634, "loss": 0.1151, "step": 1330 }, { "epoch": 0.31531266545090886, "grad_norm": 1.1629014015197754, "learning_rate": 0.00020538950341256765, "loss": 0.1406, "step": 1340 }, { "epoch": 0.3176657450438261, "grad_norm": 0.6818956732749939, "learning_rate": 0.00020468345493057188, "loss": 0.1465, "step": 1350 }, { "epoch": 0.32001882463674336, "grad_norm": 0.7869308590888977, "learning_rate": 0.00020397740644857612, "loss": 0.1515, "step": 1360 }, { "epoch": 0.32237190422966056, "grad_norm": 1.023478627204895, "learning_rate": 0.00020327135796658035, "loss": 0.1781, "step": 1370 }, { "epoch": 0.3247249838225778, "grad_norm": 1.0383384227752686, "learning_rate": 0.0002025653094845846, "loss": 0.1195, "step": 1380 }, { "epoch": 0.327078063415495, "grad_norm": 1.5291595458984375, "learning_rate": 0.00020185926100258882, "loss": 0.1334, "step": 1390 }, { "epoch": 0.32943114300841225, "grad_norm": 0.9488996267318726, "learning_rate": 0.00020115321252059306, "loss": 0.1368, "step": 1400 }, { "epoch": 0.3317842226013295, "grad_norm": 1.1703331470489502, "learning_rate": 0.0002004471640385973, "loss": 0.131, "step": 1410 }, { "epoch": 0.3341373021942467, "grad_norm": 0.6122885346412659, "learning_rate": 0.00019974111555660153, "loss": 0.1356, "step": 1420 }, { "epoch": 0.33649038178716395, "grad_norm": 0.7869921326637268, "learning_rate": 0.0001990350670746058, "loss": 0.1817, "step": 1430 }, { "epoch": 0.3388434613800812, "grad_norm": 0.691066324710846, "learning_rate": 0.00019832901859261, "loss": 0.131, "step": 1440 }, { "epoch": 0.3411965409729984, "grad_norm": 1.4205127954483032, "learning_rate": 0.00019762297011061426, "loss": 0.1366, "step": 1450 }, { "epoch": 0.34354962056591565, "grad_norm": 0.47127053141593933, "learning_rate": 0.00019691692162861847, "loss": 0.1498, "step": 1460 }, { "epoch": 0.3459027001588329, "grad_norm": 0.9336820840835571, "learning_rate": 0.00019621087314662273, "loss": 0.1512, "step": 1470 }, { "epoch": 0.3482557797517501, "grad_norm": 0.8124200105667114, "learning_rate": 0.00019550482466462694, "loss": 0.1319, "step": 1480 }, { "epoch": 0.35060885934466735, "grad_norm": 0.6921178698539734, "learning_rate": 0.0001947987761826312, "loss": 0.1279, "step": 1490 }, { "epoch": 0.35296193893758454, "grad_norm": 1.336229681968689, "learning_rate": 0.0001940927277006354, "loss": 0.1251, "step": 1500 }, { "epoch": 0.3553150185305018, "grad_norm": 0.9984803795814514, "learning_rate": 0.00019338667921863967, "loss": 0.1299, "step": 1510 }, { "epoch": 0.35766809812341904, "grad_norm": 1.0903042554855347, "learning_rate": 0.00019268063073664388, "loss": 0.1528, "step": 1520 }, { "epoch": 0.36002117771633624, "grad_norm": 0.666950523853302, "learning_rate": 0.00019197458225464814, "loss": 0.1446, "step": 1530 }, { "epoch": 0.3623742573092535, "grad_norm": 0.8104845285415649, "learning_rate": 0.00019126853377265238, "loss": 0.1221, "step": 1540 }, { "epoch": 0.36472733690217074, "grad_norm": 0.5904582738876343, 
"learning_rate": 0.00019056248529065661, "loss": 0.1164, "step": 1550 }, { "epoch": 0.36708041649508794, "grad_norm": 0.7703972458839417, "learning_rate": 0.00018985643680866085, "loss": 0.0978, "step": 1560 }, { "epoch": 0.3694334960880052, "grad_norm": 1.9245415925979614, "learning_rate": 0.00018915038832666508, "loss": 0.1624, "step": 1570 }, { "epoch": 0.3717865756809224, "grad_norm": 1.6459194421768188, "learning_rate": 0.00018844433984466932, "loss": 0.1289, "step": 1580 }, { "epoch": 0.37413965527383963, "grad_norm": 1.6774044036865234, "learning_rate": 0.00018773829136267355, "loss": 0.1468, "step": 1590 }, { "epoch": 0.3764927348667569, "grad_norm": 1.5878580808639526, "learning_rate": 0.0001870322428806778, "loss": 0.1318, "step": 1600 }, { "epoch": 0.3788458144596741, "grad_norm": 0.7039738297462463, "learning_rate": 0.00018632619439868203, "loss": 0.1242, "step": 1610 }, { "epoch": 0.38119889405259133, "grad_norm": 1.1770200729370117, "learning_rate": 0.00018562014591668626, "loss": 0.1321, "step": 1620 }, { "epoch": 0.3835519736455086, "grad_norm": 2.2201638221740723, "learning_rate": 0.00018491409743469052, "loss": 0.1214, "step": 1630 }, { "epoch": 0.3859050532384258, "grad_norm": 0.756149411201477, "learning_rate": 0.00018420804895269473, "loss": 0.1219, "step": 1640 }, { "epoch": 0.38825813283134303, "grad_norm": 0.5444088578224182, "learning_rate": 0.000183502000470699, "loss": 0.1346, "step": 1650 }, { "epoch": 0.3906112124242602, "grad_norm": 0.7643070816993713, "learning_rate": 0.0001827959519887032, "loss": 0.1324, "step": 1660 }, { "epoch": 0.3929642920171775, "grad_norm": 0.885362446308136, "learning_rate": 0.00018208990350670746, "loss": 0.1166, "step": 1670 }, { "epoch": 0.3953173716100947, "grad_norm": 0.7135679721832275, "learning_rate": 0.00018138385502471167, "loss": 0.1364, "step": 1680 }, { "epoch": 0.3976704512030119, "grad_norm": 0.5533025860786438, "learning_rate": 0.00018067780654271593, "loss": 0.1137, "step": 1690 }, { "epoch": 0.40002353079592917, "grad_norm": 0.5916281342506409, "learning_rate": 0.00017997175806072014, "loss": 0.1131, "step": 1700 }, { "epoch": 0.4023766103888464, "grad_norm": 0.8299354314804077, "learning_rate": 0.0001792657095787244, "loss": 0.1331, "step": 1710 }, { "epoch": 0.4047296899817636, "grad_norm": 0.7944399118423462, "learning_rate": 0.0001785596610967286, "loss": 0.1049, "step": 1720 }, { "epoch": 0.40708276957468087, "grad_norm": 0.6967952251434326, "learning_rate": 0.00017785361261473287, "loss": 0.0997, "step": 1730 }, { "epoch": 0.4094358491675981, "grad_norm": 0.42431318759918213, "learning_rate": 0.0001771475641327371, "loss": 0.0964, "step": 1740 }, { "epoch": 0.4117889287605153, "grad_norm": 0.6767364740371704, "learning_rate": 0.00017644151565074134, "loss": 0.1627, "step": 1750 }, { "epoch": 0.41414200835343257, "grad_norm": 1.0430301427841187, "learning_rate": 0.00017573546716874558, "loss": 0.1173, "step": 1760 }, { "epoch": 0.41649508794634976, "grad_norm": 0.6168161034584045, "learning_rate": 0.00017502941868674981, "loss": 0.1229, "step": 1770 }, { "epoch": 0.418848167539267, "grad_norm": 1.9067519903182983, "learning_rate": 0.00017432337020475405, "loss": 0.1369, "step": 1780 }, { "epoch": 0.42120124713218426, "grad_norm": 1.5157831907272339, "learning_rate": 0.00017361732172275829, "loss": 0.1243, "step": 1790 }, { "epoch": 0.42355432672510146, "grad_norm": 1.5152102708816528, "learning_rate": 0.00017291127324076252, "loss": 0.1395, "step": 1800 }, { "epoch": 0.4259074063180187, "grad_norm": 
0.8262742161750793, "learning_rate": 0.00017220522475876676, "loss": 0.1467, "step": 1810 }, { "epoch": 0.42826048591093596, "grad_norm": 0.5484256744384766, "learning_rate": 0.000171499176276771, "loss": 0.1405, "step": 1820 }, { "epoch": 0.43061356550385316, "grad_norm": 0.7796267867088318, "learning_rate": 0.00017079312779477525, "loss": 0.1508, "step": 1830 }, { "epoch": 0.4329666450967704, "grad_norm": 0.7360082268714905, "learning_rate": 0.00017008707931277946, "loss": 0.1332, "step": 1840 }, { "epoch": 0.4353197246896876, "grad_norm": 0.8352281451225281, "learning_rate": 0.00016938103083078372, "loss": 0.1343, "step": 1850 }, { "epoch": 0.43767280428260485, "grad_norm": 0.6898388862609863, "learning_rate": 0.00016867498234878793, "loss": 0.0983, "step": 1860 }, { "epoch": 0.4400258838755221, "grad_norm": 0.3843238651752472, "learning_rate": 0.0001679689338667922, "loss": 0.1091, "step": 1870 }, { "epoch": 0.4423789634684393, "grad_norm": 0.7791532278060913, "learning_rate": 0.0001672628853847964, "loss": 0.1321, "step": 1880 }, { "epoch": 0.44473204306135655, "grad_norm": 0.9906323552131653, "learning_rate": 0.00016655683690280064, "loss": 0.1125, "step": 1890 }, { "epoch": 0.4470851226542738, "grad_norm": 0.631594181060791, "learning_rate": 0.00016585078842080487, "loss": 0.1328, "step": 1900 }, { "epoch": 0.449438202247191, "grad_norm": 1.4922380447387695, "learning_rate": 0.0001651447399388091, "loss": 0.1441, "step": 1910 }, { "epoch": 0.45179128184010825, "grad_norm": 0.6896445751190186, "learning_rate": 0.00016443869145681334, "loss": 0.146, "step": 1920 }, { "epoch": 0.45414436143302545, "grad_norm": 0.6470409035682678, "learning_rate": 0.00016373264297481758, "loss": 0.1123, "step": 1930 }, { "epoch": 0.4564974410259427, "grad_norm": 1.4532804489135742, "learning_rate": 0.00016302659449282184, "loss": 0.1204, "step": 1940 }, { "epoch": 0.45885052061885995, "grad_norm": 1.5582534074783325, "learning_rate": 0.00016232054601082605, "loss": 0.1275, "step": 1950 }, { "epoch": 0.46120360021177714, "grad_norm": 0.7568921446800232, "learning_rate": 0.0001616144975288303, "loss": 0.1373, "step": 1960 }, { "epoch": 0.4635566798046944, "grad_norm": 0.7904714941978455, "learning_rate": 0.00016090844904683452, "loss": 0.1281, "step": 1970 }, { "epoch": 0.46590975939761164, "grad_norm": 0.48104897141456604, "learning_rate": 0.00016020240056483878, "loss": 0.1173, "step": 1980 }, { "epoch": 0.46826283899052884, "grad_norm": 0.6676899194717407, "learning_rate": 0.000159496352082843, "loss": 0.1297, "step": 1990 }, { "epoch": 0.4706159185834461, "grad_norm": 0.7035501599311829, "learning_rate": 0.00015879030360084725, "loss": 0.1246, "step": 2000 }, { "epoch": 0.47296899817636334, "grad_norm": 1.289421796798706, "learning_rate": 0.00015808425511885146, "loss": 0.1501, "step": 2010 }, { "epoch": 0.47532207776928054, "grad_norm": 0.6186831593513489, "learning_rate": 0.00015737820663685572, "loss": 0.1156, "step": 2020 }, { "epoch": 0.4776751573621978, "grad_norm": 0.7897233963012695, "learning_rate": 0.00015667215815485993, "loss": 0.1317, "step": 2030 }, { "epoch": 0.480028236955115, "grad_norm": 1.1652599573135376, "learning_rate": 0.0001559661096728642, "loss": 0.1325, "step": 2040 }, { "epoch": 0.48238131654803224, "grad_norm": 0.6400769948959351, "learning_rate": 0.0001552600611908684, "loss": 0.1002, "step": 2050 }, { "epoch": 0.4847343961409495, "grad_norm": 0.5541133880615234, "learning_rate": 0.00015455401270887266, "loss": 0.1232, "step": 2060 }, { "epoch": 0.4870874757338667, 
"grad_norm": 0.605411946773529, "learning_rate": 0.0001538479642268769, "loss": 0.1102, "step": 2070 }, { "epoch": 0.48944055532678393, "grad_norm": 0.49058374762535095, "learning_rate": 0.00015314191574488113, "loss": 0.1228, "step": 2080 }, { "epoch": 0.4917936349197012, "grad_norm": 0.7565241456031799, "learning_rate": 0.00015243586726288537, "loss": 0.1009, "step": 2090 }, { "epoch": 0.4941467145126184, "grad_norm": 0.4517477750778198, "learning_rate": 0.0001517298187808896, "loss": 0.129, "step": 2100 }, { "epoch": 0.49649979410553563, "grad_norm": 0.7871853709220886, "learning_rate": 0.00015102377029889384, "loss": 0.1049, "step": 2110 }, { "epoch": 0.4988528736984528, "grad_norm": 0.4314168691635132, "learning_rate": 0.00015031772181689807, "loss": 0.117, "step": 2120 }, { "epoch": 0.5012059532913701, "grad_norm": 0.8347052335739136, "learning_rate": 0.0001496116733349023, "loss": 0.1336, "step": 2130 }, { "epoch": 0.5035590328842873, "grad_norm": 0.42039480805397034, "learning_rate": 0.00014890562485290657, "loss": 0.1176, "step": 2140 }, { "epoch": 0.5059121124772046, "grad_norm": 1.1371684074401855, "learning_rate": 0.0001481995763709108, "loss": 0.1362, "step": 2150 }, { "epoch": 0.5082651920701218, "grad_norm": 0.8690921664237976, "learning_rate": 0.00014749352788891504, "loss": 0.1444, "step": 2160 }, { "epoch": 0.510618271663039, "grad_norm": 0.3952578604221344, "learning_rate": 0.00014678747940691928, "loss": 0.1411, "step": 2170 }, { "epoch": 0.5129713512559563, "grad_norm": 1.0104624032974243, "learning_rate": 0.0001460814309249235, "loss": 0.1127, "step": 2180 }, { "epoch": 0.5153244308488735, "grad_norm": 0.7708560824394226, "learning_rate": 0.00014537538244292775, "loss": 0.1364, "step": 2190 }, { "epoch": 0.5176775104417907, "grad_norm": 3.323113203048706, "learning_rate": 0.00014466933396093198, "loss": 0.1326, "step": 2200 }, { "epoch": 0.520030590034708, "grad_norm": 0.5021951198577881, "learning_rate": 0.00014396328547893622, "loss": 0.1075, "step": 2210 }, { "epoch": 0.5223836696276252, "grad_norm": 0.5558544397354126, "learning_rate": 0.00014325723699694045, "loss": 0.1019, "step": 2220 }, { "epoch": 0.5247367492205424, "grad_norm": 0.7476164102554321, "learning_rate": 0.0001425511885149447, "loss": 0.108, "step": 2230 }, { "epoch": 0.5270898288134596, "grad_norm": 0.8783542513847351, "learning_rate": 0.00014184514003294892, "loss": 0.1182, "step": 2240 }, { "epoch": 0.5294429084063769, "grad_norm": 0.5716719627380371, "learning_rate": 0.00014113909155095316, "loss": 0.1048, "step": 2250 }, { "epoch": 0.5317959879992941, "grad_norm": 0.41919055581092834, "learning_rate": 0.0001404330430689574, "loss": 0.1071, "step": 2260 }, { "epoch": 0.5341490675922113, "grad_norm": 0.672885537147522, "learning_rate": 0.00013972699458696163, "loss": 0.1333, "step": 2270 }, { "epoch": 0.5365021471851286, "grad_norm": 0.7414030432701111, "learning_rate": 0.00013902094610496586, "loss": 0.1288, "step": 2280 }, { "epoch": 0.5388552267780458, "grad_norm": 1.1601518392562866, "learning_rate": 0.0001383148976229701, "loss": 0.1099, "step": 2290 }, { "epoch": 0.541208306370963, "grad_norm": 0.4423375129699707, "learning_rate": 0.00013760884914097433, "loss": 0.1049, "step": 2300 }, { "epoch": 0.5435613859638803, "grad_norm": 0.9248809218406677, "learning_rate": 0.00013690280065897857, "loss": 0.1172, "step": 2310 }, { "epoch": 0.5459144655567975, "grad_norm": 1.3502943515777588, "learning_rate": 0.0001361967521769828, "loss": 0.1303, "step": 2320 }, { "epoch": 0.5482675451497147, 
"grad_norm": 1.488297939300537, "learning_rate": 0.00013549070369498704, "loss": 0.1182, "step": 2330 }, { "epoch": 0.550620624742632, "grad_norm": 0.6636572480201721, "learning_rate": 0.0001347846552129913, "loss": 0.1233, "step": 2340 }, { "epoch": 0.5529737043355492, "grad_norm": 0.5864549279212952, "learning_rate": 0.00013407860673099554, "loss": 0.1102, "step": 2350 }, { "epoch": 0.5553267839284663, "grad_norm": 1.9224406480789185, "learning_rate": 0.00013337255824899977, "loss": 0.1449, "step": 2360 }, { "epoch": 0.5576798635213837, "grad_norm": 1.1239560842514038, "learning_rate": 0.00013266650976700398, "loss": 0.1155, "step": 2370 }, { "epoch": 0.5600329431143009, "grad_norm": 0.6336050629615784, "learning_rate": 0.00013196046128500821, "loss": 0.1193, "step": 2380 }, { "epoch": 0.562386022707218, "grad_norm": 0.9129360914230347, "learning_rate": 0.00013125441280301245, "loss": 0.1121, "step": 2390 }, { "epoch": 0.5647391023001354, "grad_norm": 0.6220555305480957, "learning_rate": 0.00013054836432101668, "loss": 0.1172, "step": 2400 }, { "epoch": 0.5670921818930525, "grad_norm": 0.8981531262397766, "learning_rate": 0.00012984231583902092, "loss": 0.1184, "step": 2410 }, { "epoch": 0.5694452614859697, "grad_norm": 0.7610392570495605, "learning_rate": 0.00012913626735702515, "loss": 0.1204, "step": 2420 }, { "epoch": 0.5717983410788869, "grad_norm": 0.5133729577064514, "learning_rate": 0.0001284302188750294, "loss": 0.1081, "step": 2430 }, { "epoch": 0.5741514206718042, "grad_norm": 0.8097817897796631, "learning_rate": 0.00012772417039303363, "loss": 0.1142, "step": 2440 }, { "epoch": 0.5765045002647214, "grad_norm": 1.8712083101272583, "learning_rate": 0.00012701812191103786, "loss": 0.1234, "step": 2450 }, { "epoch": 0.5788575798576386, "grad_norm": 0.8425026535987854, "learning_rate": 0.00012631207342904212, "loss": 0.1027, "step": 2460 }, { "epoch": 0.5812106594505559, "grad_norm": 0.5562009811401367, "learning_rate": 0.00012560602494704636, "loss": 0.0916, "step": 2470 }, { "epoch": 0.5835637390434731, "grad_norm": 0.45057183504104614, "learning_rate": 0.0001248999764650506, "loss": 0.1166, "step": 2480 }, { "epoch": 0.5859168186363903, "grad_norm": 0.5411068797111511, "learning_rate": 0.00012419392798305483, "loss": 0.1254, "step": 2490 }, { "epoch": 0.5882698982293076, "grad_norm": 0.9400952458381653, "learning_rate": 0.00012348787950105906, "loss": 0.1411, "step": 2500 }, { "epoch": 0.5906229778222248, "grad_norm": 0.4275170564651489, "learning_rate": 0.0001227818310190633, "loss": 0.1089, "step": 2510 }, { "epoch": 0.592976057415142, "grad_norm": 1.2033214569091797, "learning_rate": 0.00012207578253706753, "loss": 0.108, "step": 2520 }, { "epoch": 0.5953291370080593, "grad_norm": 1.257379412651062, "learning_rate": 0.00012136973405507177, "loss": 0.1207, "step": 2530 }, { "epoch": 0.5976822166009765, "grad_norm": 0.7070032954216003, "learning_rate": 0.000120663685573076, "loss": 0.0879, "step": 2540 }, { "epoch": 0.6000352961938937, "grad_norm": 0.8550868034362793, "learning_rate": 0.00011995763709108024, "loss": 0.1087, "step": 2550 }, { "epoch": 0.602388375786811, "grad_norm": 0.8301357626914978, "learning_rate": 0.00011925158860908447, "loss": 0.1266, "step": 2560 }, { "epoch": 0.6047414553797282, "grad_norm": 0.4070800542831421, "learning_rate": 0.00011854554012708871, "loss": 0.1062, "step": 2570 }, { "epoch": 0.6070945349726454, "grad_norm": 1.1967391967773438, "learning_rate": 0.00011783949164509294, "loss": 0.1147, "step": 2580 }, { "epoch": 0.6094476145655627, 
"grad_norm": 0.5281302332878113, "learning_rate": 0.00011713344316309718, "loss": 0.0912, "step": 2590 }, { "epoch": 0.6118006941584799, "grad_norm": 0.5271784067153931, "learning_rate": 0.00011642739468110142, "loss": 0.1084, "step": 2600 }, { "epoch": 0.6141537737513971, "grad_norm": 0.4973151683807373, "learning_rate": 0.00011572134619910566, "loss": 0.1242, "step": 2610 }, { "epoch": 0.6165068533443143, "grad_norm": 0.4281303882598877, "learning_rate": 0.0001150152977171099, "loss": 0.1101, "step": 2620 }, { "epoch": 0.6188599329372316, "grad_norm": 0.5142689347267151, "learning_rate": 0.00011430924923511413, "loss": 0.1182, "step": 2630 }, { "epoch": 0.6212130125301488, "grad_norm": 0.5125661492347717, "learning_rate": 0.00011360320075311837, "loss": 0.0943, "step": 2640 }, { "epoch": 0.623566092123066, "grad_norm": 0.43077680468559265, "learning_rate": 0.0001128971522711226, "loss": 0.0948, "step": 2650 }, { "epoch": 0.6259191717159833, "grad_norm": 0.5074141621589661, "learning_rate": 0.00011219110378912684, "loss": 0.0853, "step": 2660 }, { "epoch": 0.6282722513089005, "grad_norm": 0.8260855674743652, "learning_rate": 0.00011148505530713107, "loss": 0.1104, "step": 2670 }, { "epoch": 0.6306253309018177, "grad_norm": 0.7819215059280396, "learning_rate": 0.00011077900682513531, "loss": 0.1256, "step": 2680 }, { "epoch": 0.632978410494735, "grad_norm": 0.46884438395500183, "learning_rate": 0.00011007295834313955, "loss": 0.1027, "step": 2690 }, { "epoch": 0.6353314900876522, "grad_norm": 0.9515593647956848, "learning_rate": 0.00010936690986114378, "loss": 0.112, "step": 2700 }, { "epoch": 0.6376845696805694, "grad_norm": 0.3602767586708069, "learning_rate": 0.00010866086137914803, "loss": 0.1053, "step": 2710 }, { "epoch": 0.6400376492734867, "grad_norm": 0.7740781903266907, "learning_rate": 0.00010795481289715226, "loss": 0.112, "step": 2720 }, { "epoch": 0.6423907288664039, "grad_norm": 0.5003033876419067, "learning_rate": 0.0001072487644151565, "loss": 0.0985, "step": 2730 }, { "epoch": 0.6447438084593211, "grad_norm": 0.4092664122581482, "learning_rate": 0.00010654271593316073, "loss": 0.1113, "step": 2740 }, { "epoch": 0.6470968880522384, "grad_norm": 0.446584552526474, "learning_rate": 0.00010583666745116497, "loss": 0.0909, "step": 2750 }, { "epoch": 0.6494499676451556, "grad_norm": 0.3130131661891937, "learning_rate": 0.0001051306189691692, "loss": 0.0954, "step": 2760 }, { "epoch": 0.6518030472380728, "grad_norm": 0.7232083082199097, "learning_rate": 0.00010442457048717344, "loss": 0.1132, "step": 2770 }, { "epoch": 0.65415612683099, "grad_norm": 0.5579691529273987, "learning_rate": 0.00010371852200517768, "loss": 0.1045, "step": 2780 }, { "epoch": 0.6565092064239073, "grad_norm": 0.5319089889526367, "learning_rate": 0.00010301247352318191, "loss": 0.1215, "step": 2790 }, { "epoch": 0.6588622860168245, "grad_norm": 0.516445517539978, "learning_rate": 0.00010230642504118615, "loss": 0.111, "step": 2800 }, { "epoch": 0.6612153656097417, "grad_norm": 0.25264236330986023, "learning_rate": 0.0001016003765591904, "loss": 0.1126, "step": 2810 }, { "epoch": 0.663568445202659, "grad_norm": 0.7910987138748169, "learning_rate": 0.00010089432807719463, "loss": 0.1306, "step": 2820 }, { "epoch": 0.6659215247955762, "grad_norm": 0.7823461890220642, "learning_rate": 0.00010018827959519886, "loss": 0.0967, "step": 2830 }, { "epoch": 0.6682746043884934, "grad_norm": 0.7126127481460571, "learning_rate": 9.94822311132031e-05, "loss": 0.1296, "step": 2840 }, { "epoch": 0.6706276839814107, 
"grad_norm": 0.9327739477157593, "learning_rate": 9.877618263120733e-05, "loss": 0.1115, "step": 2850 }, { "epoch": 0.6729807635743279, "grad_norm": 0.7680268883705139, "learning_rate": 9.807013414921157e-05, "loss": 0.1055, "step": 2860 }, { "epoch": 0.6753338431672451, "grad_norm": 0.7711540460586548, "learning_rate": 9.73640856672158e-05, "loss": 0.0951, "step": 2870 }, { "epoch": 0.6776869227601624, "grad_norm": 0.5041959881782532, "learning_rate": 9.665803718522004e-05, "loss": 0.1087, "step": 2880 }, { "epoch": 0.6800400023530796, "grad_norm": 0.5102591514587402, "learning_rate": 9.595198870322428e-05, "loss": 0.1176, "step": 2890 }, { "epoch": 0.6823930819459968, "grad_norm": 0.7100384831428528, "learning_rate": 9.524594022122851e-05, "loss": 0.1091, "step": 2900 }, { "epoch": 0.6847461615389141, "grad_norm": 0.6806867122650146, "learning_rate": 9.453989173923276e-05, "loss": 0.1251, "step": 2910 }, { "epoch": 0.6870992411318313, "grad_norm": 0.6659530401229858, "learning_rate": 9.3833843257237e-05, "loss": 0.0835, "step": 2920 }, { "epoch": 0.6894523207247485, "grad_norm": 0.4317012429237366, "learning_rate": 9.312779477524123e-05, "loss": 0.0893, "step": 2930 }, { "epoch": 0.6918054003176658, "grad_norm": 0.5916824340820312, "learning_rate": 9.242174629324546e-05, "loss": 0.0905, "step": 2940 }, { "epoch": 0.694158479910583, "grad_norm": 0.7429795265197754, "learning_rate": 9.17156978112497e-05, "loss": 0.1063, "step": 2950 }, { "epoch": 0.6965115595035002, "grad_norm": 0.87420254945755, "learning_rate": 9.100964932925394e-05, "loss": 0.1042, "step": 2960 }, { "epoch": 0.6988646390964174, "grad_norm": 0.49567267298698425, "learning_rate": 9.030360084725817e-05, "loss": 0.1045, "step": 2970 }, { "epoch": 0.7012177186893347, "grad_norm": 2.633138418197632, "learning_rate": 8.95975523652624e-05, "loss": 0.0884, "step": 2980 }, { "epoch": 0.7035707982822519, "grad_norm": 0.33752286434173584, "learning_rate": 8.889150388326664e-05, "loss": 0.0848, "step": 2990 }, { "epoch": 0.7059238778751691, "grad_norm": 0.5974826812744141, "learning_rate": 8.818545540127088e-05, "loss": 0.0971, "step": 3000 }, { "epoch": 0.7082769574680864, "grad_norm": 0.43427976965904236, "learning_rate": 8.747940691927512e-05, "loss": 0.1165, "step": 3010 }, { "epoch": 0.7106300370610036, "grad_norm": 0.7770646810531616, "learning_rate": 8.677335843727936e-05, "loss": 0.1084, "step": 3020 }, { "epoch": 0.7129831166539208, "grad_norm": 0.5276495218276978, "learning_rate": 8.60673099552836e-05, "loss": 0.111, "step": 3030 }, { "epoch": 0.7153361962468381, "grad_norm": 0.9737383127212524, "learning_rate": 8.536126147328783e-05, "loss": 0.0972, "step": 3040 }, { "epoch": 0.7176892758397553, "grad_norm": 0.36562997102737427, "learning_rate": 8.465521299129207e-05, "loss": 0.093, "step": 3050 }, { "epoch": 0.7200423554326725, "grad_norm": 0.8244528770446777, "learning_rate": 8.39491645092963e-05, "loss": 0.1263, "step": 3060 }, { "epoch": 0.7223954350255898, "grad_norm": 1.9532008171081543, "learning_rate": 8.324311602730054e-05, "loss": 0.1251, "step": 3070 }, { "epoch": 0.724748514618507, "grad_norm": 0.572896420955658, "learning_rate": 8.253706754530477e-05, "loss": 0.0875, "step": 3080 }, { "epoch": 0.7271015942114242, "grad_norm": 1.2975929975509644, "learning_rate": 8.1831019063309e-05, "loss": 0.1023, "step": 3090 }, { "epoch": 0.7294546738043415, "grad_norm": 0.5758102536201477, "learning_rate": 8.112497058131324e-05, "loss": 0.1019, "step": 3100 }, { "epoch": 0.7318077533972587, "grad_norm": 0.553327202796936, 
"learning_rate": 8.041892209931749e-05, "loss": 0.1128, "step": 3110 }, { "epoch": 0.7341608329901759, "grad_norm": 0.5465438961982727, "learning_rate": 7.971287361732173e-05, "loss": 0.1237, "step": 3120 }, { "epoch": 0.7365139125830932, "grad_norm": 0.46917715668678284, "learning_rate": 7.900682513532596e-05, "loss": 0.0964, "step": 3130 }, { "epoch": 0.7388669921760104, "grad_norm": 0.8454899787902832, "learning_rate": 7.83007766533302e-05, "loss": 0.0901, "step": 3140 }, { "epoch": 0.7412200717689276, "grad_norm": 0.8698781728744507, "learning_rate": 7.759472817133443e-05, "loss": 0.1119, "step": 3150 }, { "epoch": 0.7435731513618448, "grad_norm": 1.7399003505706787, "learning_rate": 7.688867968933867e-05, "loss": 0.1122, "step": 3160 }, { "epoch": 0.7459262309547621, "grad_norm": 0.4506986141204834, "learning_rate": 7.61826312073429e-05, "loss": 0.09, "step": 3170 }, { "epoch": 0.7482793105476793, "grad_norm": 0.7856936454772949, "learning_rate": 7.547658272534714e-05, "loss": 0.1194, "step": 3180 }, { "epoch": 0.7506323901405965, "grad_norm": 0.5778619050979614, "learning_rate": 7.477053424335137e-05, "loss": 0.1167, "step": 3190 }, { "epoch": 0.7529854697335138, "grad_norm": 0.4940952658653259, "learning_rate": 7.40644857613556e-05, "loss": 0.0813, "step": 3200 }, { "epoch": 0.755338549326431, "grad_norm": 1.1496696472167969, "learning_rate": 7.335843727935984e-05, "loss": 0.0966, "step": 3210 }, { "epoch": 0.7576916289193482, "grad_norm": 0.4551859498023987, "learning_rate": 7.265238879736408e-05, "loss": 0.0956, "step": 3220 }, { "epoch": 0.7600447085122655, "grad_norm": 0.5476594567298889, "learning_rate": 7.194634031536831e-05, "loss": 0.1225, "step": 3230 }, { "epoch": 0.7623977881051827, "grad_norm": 0.4413054287433624, "learning_rate": 7.124029183337255e-05, "loss": 0.0996, "step": 3240 }, { "epoch": 0.7647508676980999, "grad_norm": 0.6522489190101624, "learning_rate": 7.053424335137678e-05, "loss": 0.0946, "step": 3250 }, { "epoch": 0.7671039472910172, "grad_norm": 0.4750779867172241, "learning_rate": 6.982819486938102e-05, "loss": 0.0788, "step": 3260 }, { "epoch": 0.7694570268839344, "grad_norm": 0.336505264043808, "learning_rate": 6.912214638738527e-05, "loss": 0.0841, "step": 3270 }, { "epoch": 0.7718101064768516, "grad_norm": 1.4274874925613403, "learning_rate": 6.84160979053895e-05, "loss": 0.0991, "step": 3280 }, { "epoch": 0.7741631860697689, "grad_norm": 0.6464115977287292, "learning_rate": 6.771004942339374e-05, "loss": 0.1172, "step": 3290 }, { "epoch": 0.7765162656626861, "grad_norm": 0.35535725951194763, "learning_rate": 6.700400094139797e-05, "loss": 0.09, "step": 3300 }, { "epoch": 0.7788693452556033, "grad_norm": 0.22626227140426636, "learning_rate": 6.62979524594022e-05, "loss": 0.089, "step": 3310 }, { "epoch": 0.7812224248485204, "grad_norm": 0.6091925501823425, "learning_rate": 6.559190397740644e-05, "loss": 0.0851, "step": 3320 }, { "epoch": 0.7835755044414378, "grad_norm": 2.3381729125976562, "learning_rate": 6.488585549541068e-05, "loss": 0.1001, "step": 3330 }, { "epoch": 0.785928584034355, "grad_norm": 0.41597291827201843, "learning_rate": 6.417980701341491e-05, "loss": 0.0985, "step": 3340 }, { "epoch": 0.7882816636272721, "grad_norm": 0.6187950372695923, "learning_rate": 6.347375853141915e-05, "loss": 0.0877, "step": 3350 }, { "epoch": 0.7906347432201895, "grad_norm": 0.4807620942592621, "learning_rate": 6.276771004942338e-05, "loss": 0.1074, "step": 3360 }, { "epoch": 0.7929878228131066, "grad_norm": 0.2998965382575989, "learning_rate": 
6.206166156742763e-05, "loss": 0.1044, "step": 3370 }, { "epoch": 0.7953409024060238, "grad_norm": 0.5904129147529602, "learning_rate": 6.135561308543187e-05, "loss": 0.1073, "step": 3380 }, { "epoch": 0.7976939819989411, "grad_norm": 0.6356788277626038, "learning_rate": 6.06495646034361e-05, "loss": 0.1121, "step": 3390 }, { "epoch": 0.8000470615918583, "grad_norm": 0.9147433638572693, "learning_rate": 5.994351612144034e-05, "loss": 0.1103, "step": 3400 }, { "epoch": 0.8024001411847755, "grad_norm": 0.8032605051994324, "learning_rate": 5.923746763944457e-05, "loss": 0.0965, "step": 3410 }, { "epoch": 0.8047532207776928, "grad_norm": 0.7935906052589417, "learning_rate": 5.853141915744881e-05, "loss": 0.1007, "step": 3420 }, { "epoch": 0.80710630037061, "grad_norm": 0.4112412631511688, "learning_rate": 5.782537067545304e-05, "loss": 0.0882, "step": 3430 }, { "epoch": 0.8094593799635272, "grad_norm": 0.8190514445304871, "learning_rate": 5.7119322193457284e-05, "loss": 0.1019, "step": 3440 }, { "epoch": 0.8118124595564445, "grad_norm": 0.6029698848724365, "learning_rate": 5.641327371146152e-05, "loss": 0.1053, "step": 3450 }, { "epoch": 0.8141655391493617, "grad_norm": 0.43347781896591187, "learning_rate": 5.5707225229465755e-05, "loss": 0.1044, "step": 3460 }, { "epoch": 0.8165186187422789, "grad_norm": 1.5235440731048584, "learning_rate": 5.500117674746999e-05, "loss": 0.0982, "step": 3470 }, { "epoch": 0.8188716983351962, "grad_norm": 0.5716174244880676, "learning_rate": 5.4295128265474225e-05, "loss": 0.1078, "step": 3480 }, { "epoch": 0.8212247779281134, "grad_norm": 1.5008090734481812, "learning_rate": 5.358907978347847e-05, "loss": 0.0915, "step": 3490 }, { "epoch": 0.8235778575210306, "grad_norm": 0.49782001972198486, "learning_rate": 5.28830313014827e-05, "loss": 0.0892, "step": 3500 }, { "epoch": 0.8259309371139478, "grad_norm": 0.4466950297355652, "learning_rate": 5.217698281948694e-05, "loss": 0.0905, "step": 3510 }, { "epoch": 0.8282840167068651, "grad_norm": 0.5504721403121948, "learning_rate": 5.147093433749117e-05, "loss": 0.1128, "step": 3520 }, { "epoch": 0.8306370962997823, "grad_norm": 0.4870951473712921, "learning_rate": 5.076488585549541e-05, "loss": 0.0876, "step": 3530 }, { "epoch": 0.8329901758926995, "grad_norm": 0.6789172887802124, "learning_rate": 5.005883737349965e-05, "loss": 0.1004, "step": 3540 }, { "epoch": 0.8353432554856168, "grad_norm": 0.5021870136260986, "learning_rate": 4.935278889150388e-05, "loss": 0.0862, "step": 3550 }, { "epoch": 0.837696335078534, "grad_norm": 0.5829181671142578, "learning_rate": 4.864674040950811e-05, "loss": 0.0994, "step": 3560 }, { "epoch": 0.8400494146714512, "grad_norm": 1.029181957244873, "learning_rate": 4.794069192751235e-05, "loss": 0.1144, "step": 3570 }, { "epoch": 0.8424024942643685, "grad_norm": 0.6730376482009888, "learning_rate": 4.723464344551658e-05, "loss": 0.106, "step": 3580 }, { "epoch": 0.8447555738572857, "grad_norm": 0.6129499673843384, "learning_rate": 4.652859496352082e-05, "loss": 0.0896, "step": 3590 }, { "epoch": 0.8471086534502029, "grad_norm": 0.422830194234848, "learning_rate": 4.582254648152506e-05, "loss": 0.0846, "step": 3600 }, { "epoch": 0.8494617330431202, "grad_norm": 0.5306664109230042, "learning_rate": 4.5116497999529296e-05, "loss": 0.1059, "step": 3610 }, { "epoch": 0.8518148126360374, "grad_norm": 0.6436883807182312, "learning_rate": 4.441044951753353e-05, "loss": 0.1132, "step": 3620 }, { "epoch": 0.8541678922289546, "grad_norm": 0.4121890962123871, "learning_rate": 
4.3704401035537766e-05, "loss": 0.0864, "step": 3630 }, { "epoch": 0.8565209718218719, "grad_norm": 0.42521169781684875, "learning_rate": 4.2998352553542e-05, "loss": 0.1011, "step": 3640 }, { "epoch": 0.8588740514147891, "grad_norm": 0.49623095989227295, "learning_rate": 4.229230407154624e-05, "loss": 0.1093, "step": 3650 }, { "epoch": 0.8612271310077063, "grad_norm": 0.5516742467880249, "learning_rate": 4.158625558955048e-05, "loss": 0.1009, "step": 3660 }, { "epoch": 0.8635802106006236, "grad_norm": 0.37128451466560364, "learning_rate": 4.0880207107554713e-05, "loss": 0.0717, "step": 3670 }, { "epoch": 0.8659332901935408, "grad_norm": 0.3802624046802521, "learning_rate": 4.017415862555895e-05, "loss": 0.0891, "step": 3680 }, { "epoch": 0.868286369786458, "grad_norm": 0.35558944940567017, "learning_rate": 3.9468110143563184e-05, "loss": 0.0863, "step": 3690 }, { "epoch": 0.8706394493793752, "grad_norm": 0.2548139989376068, "learning_rate": 3.8762061661567426e-05, "loss": 0.0946, "step": 3700 }, { "epoch": 0.8729925289722925, "grad_norm": 0.3489900827407837, "learning_rate": 3.805601317957166e-05, "loss": 0.0794, "step": 3710 }, { "epoch": 0.8753456085652097, "grad_norm": 0.7514833807945251, "learning_rate": 3.7349964697575896e-05, "loss": 0.1026, "step": 3720 }, { "epoch": 0.8776986881581269, "grad_norm": 0.28846803307533264, "learning_rate": 3.664391621558013e-05, "loss": 0.107, "step": 3730 }, { "epoch": 0.8800517677510442, "grad_norm": 0.3054257333278656, "learning_rate": 3.5937867733584366e-05, "loss": 0.0839, "step": 3740 }, { "epoch": 0.8824048473439614, "grad_norm": 0.487393856048584, "learning_rate": 3.523181925158861e-05, "loss": 0.099, "step": 3750 }, { "epoch": 0.8847579269368786, "grad_norm": 0.7874276041984558, "learning_rate": 3.4525770769592843e-05, "loss": 0.0873, "step": 3760 }, { "epoch": 0.8871110065297959, "grad_norm": 0.3583498001098633, "learning_rate": 3.381972228759708e-05, "loss": 0.0854, "step": 3770 }, { "epoch": 0.8894640861227131, "grad_norm": 0.5606823563575745, "learning_rate": 3.3113673805601314e-05, "loss": 0.1106, "step": 3780 }, { "epoch": 0.8918171657156303, "grad_norm": 0.48208296298980713, "learning_rate": 3.240762532360555e-05, "loss": 0.1138, "step": 3790 }, { "epoch": 0.8941702453085476, "grad_norm": 1.026995301246643, "learning_rate": 3.170157684160979e-05, "loss": 0.0877, "step": 3800 }, { "epoch": 0.8965233249014648, "grad_norm": 0.7940952777862549, "learning_rate": 3.0995528359614026e-05, "loss": 0.069, "step": 3810 }, { "epoch": 0.898876404494382, "grad_norm": 0.7711090445518494, "learning_rate": 3.028947987761826e-05, "loss": 0.0884, "step": 3820 }, { "epoch": 0.9012294840872993, "grad_norm": 0.6985650062561035, "learning_rate": 2.9583431395622496e-05, "loss": 0.0929, "step": 3830 }, { "epoch": 0.9035825636802165, "grad_norm": 0.5291894674301147, "learning_rate": 2.8877382913626735e-05, "loss": 0.1166, "step": 3840 }, { "epoch": 0.9059356432731337, "grad_norm": 0.3929837644100189, "learning_rate": 2.817133443163097e-05, "loss": 0.0904, "step": 3850 }, { "epoch": 0.9082887228660509, "grad_norm": 0.492017537355423, "learning_rate": 2.746528594963521e-05, "loss": 0.0986, "step": 3860 }, { "epoch": 0.9106418024589682, "grad_norm": 0.5756918787956238, "learning_rate": 2.6759237467639444e-05, "loss": 0.0897, "step": 3870 }, { "epoch": 0.9129948820518854, "grad_norm": 0.5781024098396301, "learning_rate": 2.605318898564368e-05, "loss": 0.0799, "step": 3880 }, { "epoch": 0.9153479616448026, "grad_norm": 0.28270334005355835, "learning_rate": 
2.5347140503647918e-05, "loss": 0.0889, "step": 3890 }, { "epoch": 0.9177010412377199, "grad_norm": 0.5788043737411499, "learning_rate": 2.464109202165215e-05, "loss": 0.0788, "step": 3900 }, { "epoch": 0.9200541208306371, "grad_norm": 0.563836932182312, "learning_rate": 2.3935043539656384e-05, "loss": 0.0914, "step": 3910 }, { "epoch": 0.9224072004235543, "grad_norm": 0.4077290892601013, "learning_rate": 2.3228995057660623e-05, "loss": 0.1057, "step": 3920 }, { "epoch": 0.9247602800164716, "grad_norm": 0.6209468841552734, "learning_rate": 2.2522946575664858e-05, "loss": 0.0812, "step": 3930 }, { "epoch": 0.9271133596093888, "grad_norm": 0.542506754398346, "learning_rate": 2.1816898093669097e-05, "loss": 0.0951, "step": 3940 }, { "epoch": 0.929466439202306, "grad_norm": 0.5754973292350769, "learning_rate": 2.1110849611673332e-05, "loss": 0.1023, "step": 3950 }, { "epoch": 0.9318195187952233, "grad_norm": 0.3798030912876129, "learning_rate": 2.0404801129677567e-05, "loss": 0.0997, "step": 3960 }, { "epoch": 0.9341725983881405, "grad_norm": 0.6593634486198425, "learning_rate": 1.9698752647681806e-05, "loss": 0.1056, "step": 3970 }, { "epoch": 0.9365256779810577, "grad_norm": 0.46481505036354065, "learning_rate": 1.899270416568604e-05, "loss": 0.0795, "step": 3980 }, { "epoch": 0.938878757573975, "grad_norm": 0.5140686631202698, "learning_rate": 1.828665568369028e-05, "loss": 0.0809, "step": 3990 }, { "epoch": 0.9412318371668922, "grad_norm": 0.8201892375946045, "learning_rate": 1.7580607201694515e-05, "loss": 0.0851, "step": 4000 }, { "epoch": 0.9435849167598094, "grad_norm": 0.3848946690559387, "learning_rate": 1.687455871969875e-05, "loss": 0.0789, "step": 4010 }, { "epoch": 0.9459379963527267, "grad_norm": 0.3362932503223419, "learning_rate": 1.6168510237702988e-05, "loss": 0.0841, "step": 4020 }, { "epoch": 0.9482910759456439, "grad_norm": 0.400037556886673, "learning_rate": 1.5462461755707223e-05, "loss": 0.1004, "step": 4030 }, { "epoch": 0.9506441555385611, "grad_norm": 0.6505069136619568, "learning_rate": 1.475641327371146e-05, "loss": 0.0977, "step": 4040 }, { "epoch": 0.9529972351314783, "grad_norm": 0.710784375667572, "learning_rate": 1.4050364791715697e-05, "loss": 0.085, "step": 4050 }, { "epoch": 0.9553503147243956, "grad_norm": 0.4263714849948883, "learning_rate": 1.3344316309719934e-05, "loss": 0.1044, "step": 4060 }, { "epoch": 0.9577033943173128, "grad_norm": 0.42400240898132324, "learning_rate": 1.2638267827724171e-05, "loss": 0.1113, "step": 4070 }, { "epoch": 0.96005647391023, "grad_norm": 0.2722209393978119, "learning_rate": 1.1932219345728404e-05, "loss": 0.0792, "step": 4080 }, { "epoch": 0.9624095535031473, "grad_norm": 0.9779515862464905, "learning_rate": 1.1226170863732641e-05, "loss": 0.1044, "step": 4090 }, { "epoch": 0.9647626330960645, "grad_norm": 1.028387188911438, "learning_rate": 1.0520122381736878e-05, "loss": 0.1043, "step": 4100 }, { "epoch": 0.9671157126889817, "grad_norm": 0.5009176135063171, "learning_rate": 9.814073899741115e-06, "loss": 0.087, "step": 4110 }, { "epoch": 0.969468792281899, "grad_norm": 0.33020302653312683, "learning_rate": 9.10802541774535e-06, "loss": 0.092, "step": 4120 }, { "epoch": 0.9718218718748162, "grad_norm": 0.4314991533756256, "learning_rate": 8.401976935749587e-06, "loss": 0.075, "step": 4130 }, { "epoch": 0.9741749514677334, "grad_norm": 0.6121822595596313, "learning_rate": 7.695928453753824e-06, "loss": 0.0892, "step": 4140 }, { "epoch": 0.9765280310606507, "grad_norm": 0.3374115824699402, "learning_rate": 
6.989879971758061e-06, "loss": 0.0755, "step": 4150 }, { "epoch": 0.9788811106535679, "grad_norm": 0.5865825414657593, "learning_rate": 6.283831489762297e-06, "loss": 0.0957, "step": 4160 }, { "epoch": 0.9812341902464851, "grad_norm": 0.2131696194410324, "learning_rate": 5.577783007766533e-06, "loss": 0.0849, "step": 4170 }, { "epoch": 0.9835872698394024, "grad_norm": 1.3489303588867188, "learning_rate": 4.871734525770769e-06, "loss": 0.098, "step": 4180 }, { "epoch": 0.9859403494323196, "grad_norm": 0.15470068156719208, "learning_rate": 4.1656860437750056e-06, "loss": 0.0847, "step": 4190 }, { "epoch": 0.9882934290252368, "grad_norm": 0.8059414625167847, "learning_rate": 3.459637561779242e-06, "loss": 0.1094, "step": 4200 }, { "epoch": 0.990646508618154, "grad_norm": 0.6808902621269226, "learning_rate": 2.753589079783478e-06, "loss": 0.0867, "step": 4210 }, { "epoch": 0.9929995882110713, "grad_norm": 0.29802441596984863, "learning_rate": 2.0475405977877145e-06, "loss": 0.0866, "step": 4220 }, { "epoch": 0.9953526678039885, "grad_norm": 0.5227815508842468, "learning_rate": 1.341492115791951e-06, "loss": 0.0758, "step": 4230 }, { "epoch": 0.9977057473969057, "grad_norm": 0.25185248255729675, "learning_rate": 6.354436337961872e-07, "loss": 0.0906, "step": 4240 }, { "epoch": 0.9998235190305312, "step": 4249, "total_flos": 1.037665224400896e+16, "train_loss": 0.1519955188758403, "train_runtime": 5739.131, "train_samples_per_second": 11.848, "train_steps_per_second": 0.74 } ], "logging_steps": 10, "max_steps": 4249, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "total_flos": 1.037665224400896e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }
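
For reference, a minimal sketch of how the log_history entries above could be inspected and plotted. It assumes the JSON object is saved as "trainer_state.json" in the working directory (a hypothetical path) and that matplotlib is installed; it is an illustration of the record layout (step, loss, learning_rate per logged point, plus one final summary record), not part of the Trainer output itself.

# Sketch, under the assumptions stated above: load the Trainer state,
# keep the per-step logging records, and plot loss and learning rate
# against global step.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:  # hypothetical file name
    state = json.load(f)

# Per-step records carry a "loss" key; the final summary record
# (train_loss, train_runtime, ...) does not.
entries = [e for e in state["log_history"] if "loss" in e]

steps = [e["step"] for e in entries]
losses = [e["loss"] for e in entries]
lrs = [e["learning_rate"] for e in entries]

summary = state["log_history"][-1]
print(f"{len(entries)} logged points, last logged loss {losses[-1]:.4f}, "
      f"overall train_loss {summary['train_loss']:.4f}")

fig, ax_loss = plt.subplots()
ax_loss.plot(steps, losses, label="training loss")
ax_loss.set_xlabel("global step")
ax_loss.set_ylabel("loss")

ax_lr = ax_loss.twinx()  # second y-axis for the linearly decaying learning rate
ax_lr.plot(steps, lrs, linestyle="--", label="learning rate")
ax_lr.set_ylabel("learning rate")

fig.legend(loc="upper right")
fig.tight_layout()
plt.show()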