|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9998235190305312, |
|
"eval_steps": 500, |
|
"global_step": 4249, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0023530795929172306, |
|
"grad_norm": 4.500667572021484, |
|
"learning_rate": 0.0002992939515180042, |
|
"loss": 2.9988, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.004706159185834461, |
|
"grad_norm": 2.7827706336975098, |
|
"learning_rate": 0.00029858790303600844, |
|
"loss": 1.0643, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.007059238778751691, |
|
"grad_norm": 2.746577739715576, |
|
"learning_rate": 0.0002978818545540127, |
|
"loss": 0.7978, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.009412318371668922, |
|
"grad_norm": 2.567692279815674, |
|
"learning_rate": 0.0002971758060720169, |
|
"loss": 0.6546, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.011765397964586153, |
|
"grad_norm": 2.2394003868103027, |
|
"learning_rate": 0.00029646975759002115, |
|
"loss": 0.5361, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.014118477557503383, |
|
"grad_norm": 2.1666100025177, |
|
"learning_rate": 0.0002957637091080254, |
|
"loss": 0.5137, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.01647155715042061, |
|
"grad_norm": 1.922058343887329, |
|
"learning_rate": 0.0002950576606260296, |
|
"loss": 0.446, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.018824636743337845, |
|
"grad_norm": 1.732611060142517, |
|
"learning_rate": 0.00029435161214403386, |
|
"loss": 0.4405, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.021177716336255075, |
|
"grad_norm": 2.2046239376068115, |
|
"learning_rate": 0.0002936455636620381, |
|
"loss": 0.3959, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.023530795929172305, |
|
"grad_norm": 2.083113670349121, |
|
"learning_rate": 0.0002929395151800423, |
|
"loss": 0.4114, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.025883875522089535, |
|
"grad_norm": 1.671247124671936, |
|
"learning_rate": 0.00029223346669804656, |
|
"loss": 0.3554, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.028236955115006766, |
|
"grad_norm": 2.001924514770508, |
|
"learning_rate": 0.0002915274182160508, |
|
"loss": 0.3577, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.030590034707923996, |
|
"grad_norm": 2.07259202003479, |
|
"learning_rate": 0.00029082136973405503, |
|
"loss": 0.3422, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.03294311430084122, |
|
"grad_norm": 1.7239247560501099, |
|
"learning_rate": 0.00029011532125205927, |
|
"loss": 0.3079, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.03529619389375846, |
|
"grad_norm": 1.7430157661437988, |
|
"learning_rate": 0.0002894092727700635, |
|
"loss": 0.3304, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.03764927348667569, |
|
"grad_norm": 1.1152617931365967, |
|
"learning_rate": 0.00028870322428806774, |
|
"loss": 0.3009, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.04000235307959292, |
|
"grad_norm": 1.5272759199142456, |
|
"learning_rate": 0.00028799717580607197, |
|
"loss": 0.3027, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.04235543267251015, |
|
"grad_norm": 1.3934285640716553, |
|
"learning_rate": 0.0002872911273240762, |
|
"loss": 0.2514, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.04470851226542738, |
|
"grad_norm": 1.7138372659683228, |
|
"learning_rate": 0.00028658507884208044, |
|
"loss": 0.2556, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.04706159185834461, |
|
"grad_norm": 1.7979109287261963, |
|
"learning_rate": 0.00028587903036008473, |
|
"loss": 0.2696, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.04941467145126184, |
|
"grad_norm": 1.342785358428955, |
|
"learning_rate": 0.0002851729818780889, |
|
"loss": 0.2496, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.05176775104417907, |
|
"grad_norm": 1.5516395568847656, |
|
"learning_rate": 0.0002844669333960932, |
|
"loss": 0.2727, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.0541208306370963, |
|
"grad_norm": 6.922358989715576, |
|
"learning_rate": 0.0002837608849140974, |
|
"loss": 0.2492, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.05647391023001353, |
|
"grad_norm": 1.5551228523254395, |
|
"learning_rate": 0.00028305483643210167, |
|
"loss": 0.2451, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.05882698982293076, |
|
"grad_norm": 1.300445318222046, |
|
"learning_rate": 0.00028234878795010585, |
|
"loss": 0.253, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.06118006941584799, |
|
"grad_norm": 1.4355467557907104, |
|
"learning_rate": 0.00028164273946811014, |
|
"loss": 0.2453, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.06353314900876522, |
|
"grad_norm": 15.9704008102417, |
|
"learning_rate": 0.0002809366909861143, |
|
"loss": 0.2072, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.06588622860168244, |
|
"grad_norm": 1.7124171257019043, |
|
"learning_rate": 0.0002802306425041186, |
|
"loss": 0.4367, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.06823930819459968, |
|
"grad_norm": 1.6787582635879517, |
|
"learning_rate": 0.0002795245940221228, |
|
"loss": 0.2716, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.07059238778751692, |
|
"grad_norm": 1.2618638277053833, |
|
"learning_rate": 0.0002788185455401271, |
|
"loss": 0.2821, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.07294546738043414, |
|
"grad_norm": 2.927347421646118, |
|
"learning_rate": 0.00027811249705813126, |
|
"loss": 0.2712, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.07529854697335138, |
|
"grad_norm": 1.9304898977279663, |
|
"learning_rate": 0.00027740644857613555, |
|
"loss": 0.2637, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.0776516265662686, |
|
"grad_norm": 1.2599807977676392, |
|
"learning_rate": 0.0002767004000941398, |
|
"loss": 0.2389, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.08000470615918584, |
|
"grad_norm": 1.4264953136444092, |
|
"learning_rate": 0.000275994351612144, |
|
"loss": 0.2143, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.08235778575210306, |
|
"grad_norm": 1.4229093790054321, |
|
"learning_rate": 0.00027528830313014826, |
|
"loss": 0.2571, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.0847108653450203, |
|
"grad_norm": 1.743034839630127, |
|
"learning_rate": 0.0002745822546481525, |
|
"loss": 0.2292, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.08706394493793752, |
|
"grad_norm": 1.3582898378372192, |
|
"learning_rate": 0.00027387620616615673, |
|
"loss": 0.2314, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.08941702453085476, |
|
"grad_norm": 1.2714539766311646, |
|
"learning_rate": 0.00027317015768416096, |
|
"loss": 0.2694, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.09177010412377198, |
|
"grad_norm": 1.0213568210601807, |
|
"learning_rate": 0.0002724641092021652, |
|
"loss": 0.2269, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.09412318371668922, |
|
"grad_norm": 0.8783596754074097, |
|
"learning_rate": 0.00027175806072016943, |
|
"loss": 0.2488, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.09647626330960644, |
|
"grad_norm": 1.842328667640686, |
|
"learning_rate": 0.00027105201223817367, |
|
"loss": 0.2175, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.09882934290252368, |
|
"grad_norm": 1.4185247421264648, |
|
"learning_rate": 0.0002703459637561779, |
|
"loss": 0.2049, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.1011824224954409, |
|
"grad_norm": 1.3057924509048462, |
|
"learning_rate": 0.00026963991527418214, |
|
"loss": 0.1819, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.10353550208835814, |
|
"grad_norm": 1.563916802406311, |
|
"learning_rate": 0.0002689338667921864, |
|
"loss": 0.2042, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.10588858168127536, |
|
"grad_norm": 0.9588648080825806, |
|
"learning_rate": 0.0002682278183101906, |
|
"loss": 0.1977, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.1082416612741926, |
|
"grad_norm": 1.3258203268051147, |
|
"learning_rate": 0.00026752176982819485, |
|
"loss": 0.1984, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.11059474086710983, |
|
"grad_norm": 1.6783477067947388, |
|
"learning_rate": 0.0002668157213461991, |
|
"loss": 0.2086, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.11294782046002706, |
|
"grad_norm": 1.820469617843628, |
|
"learning_rate": 0.0002661096728642033, |
|
"loss": 0.2128, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.11530090005294429, |
|
"grad_norm": 1.1493395566940308, |
|
"learning_rate": 0.00026540362438220755, |
|
"loss": 0.2022, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.11765397964586152, |
|
"grad_norm": 1.3134245872497559, |
|
"learning_rate": 0.0002646975759002118, |
|
"loss": 0.2199, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.12000705923877875, |
|
"grad_norm": 4.0345988273620605, |
|
"learning_rate": 0.000263991527418216, |
|
"loss": 0.2181, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.12236013883169598, |
|
"grad_norm": 1.7995352745056152, |
|
"learning_rate": 0.00026328547893622026, |
|
"loss": 0.2233, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.1247132184246132, |
|
"grad_norm": 1.6303889751434326, |
|
"learning_rate": 0.0002625794304542245, |
|
"loss": 0.2161, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.12706629801753044, |
|
"grad_norm": 1.1785417795181274, |
|
"learning_rate": 0.0002618733819722287, |
|
"loss": 0.1952, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.12941937761044767, |
|
"grad_norm": 2.5084645748138428, |
|
"learning_rate": 0.00026116733349023296, |
|
"loss": 0.204, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.1317724572033649, |
|
"grad_norm": 0.9784884452819824, |
|
"learning_rate": 0.0002604612850082372, |
|
"loss": 0.1598, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.13412553679628214, |
|
"grad_norm": 0.8020937442779541, |
|
"learning_rate": 0.00025975523652624143, |
|
"loss": 0.2045, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.13647861638919936, |
|
"grad_norm": 0.9151997566223145, |
|
"learning_rate": 0.00025904918804424567, |
|
"loss": 0.1744, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.1388316959821166, |
|
"grad_norm": 1.55955171585083, |
|
"learning_rate": 0.0002583431395622499, |
|
"loss": 0.1878, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.14118477557503384, |
|
"grad_norm": 23.52423858642578, |
|
"learning_rate": 0.0002576370910802542, |
|
"loss": 0.1779, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.14353785516795106, |
|
"grad_norm": 1.1516189575195312, |
|
"learning_rate": 0.0002569310425982584, |
|
"loss": 0.195, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.14589093476086828, |
|
"grad_norm": 1.0912541151046753, |
|
"learning_rate": 0.00025622499411626266, |
|
"loss": 0.205, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.1482440143537855, |
|
"grad_norm": 1.2680310010910034, |
|
"learning_rate": 0.00025551894563426684, |
|
"loss": 0.1654, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.15059709394670276, |
|
"grad_norm": 1.2099932432174683, |
|
"learning_rate": 0.00025481289715227113, |
|
"loss": 0.1698, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.15295017353961998, |
|
"grad_norm": 1.1155511140823364, |
|
"learning_rate": 0.0002541068486702753, |
|
"loss": 0.1883, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.1553032531325372, |
|
"grad_norm": 1.2237110137939453, |
|
"learning_rate": 0.0002534008001882796, |
|
"loss": 0.1739, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.15765633272545443, |
|
"grad_norm": 2.2334392070770264, |
|
"learning_rate": 0.0002526947517062838, |
|
"loss": 0.2149, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.16000941231837168, |
|
"grad_norm": 1.0051536560058594, |
|
"learning_rate": 0.0002519887032242881, |
|
"loss": 0.1755, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.1623624919112889, |
|
"grad_norm": 1.5381518602371216, |
|
"learning_rate": 0.00025128265474229225, |
|
"loss": 0.1814, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.16471557150420613, |
|
"grad_norm": 1.3390990495681763, |
|
"learning_rate": 0.00025057660626029654, |
|
"loss": 0.1866, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.16706865109712335, |
|
"grad_norm": 1.4517531394958496, |
|
"learning_rate": 0.0002498705577783007, |
|
"loss": 0.1928, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.1694217306900406, |
|
"grad_norm": 1.4081028699874878, |
|
"learning_rate": 0.000249164509296305, |
|
"loss": 0.1714, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.17177481028295782, |
|
"grad_norm": 1.357934832572937, |
|
"learning_rate": 0.00024845846081430925, |
|
"loss": 0.1856, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.17412788987587505, |
|
"grad_norm": 1.043090581893921, |
|
"learning_rate": 0.0002477524123323135, |
|
"loss": 0.1495, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.17648096946879227, |
|
"grad_norm": 1.2053163051605225, |
|
"learning_rate": 0.0002470463638503177, |
|
"loss": 0.1517, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.17883404906170952, |
|
"grad_norm": 2.3474409580230713, |
|
"learning_rate": 0.00024634031536832195, |
|
"loss": 0.1882, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.18118712865462674, |
|
"grad_norm": 0.8380926847457886, |
|
"learning_rate": 0.0002456342668863262, |
|
"loss": 0.1796, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.18354020824754397, |
|
"grad_norm": 1.3997254371643066, |
|
"learning_rate": 0.0002449282184043304, |
|
"loss": 0.1843, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.1858932878404612, |
|
"grad_norm": 1.3143609762191772, |
|
"learning_rate": 0.00024422216992233466, |
|
"loss": 0.1629, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.18824636743337844, |
|
"grad_norm": 0.9414114952087402, |
|
"learning_rate": 0.0002435161214403389, |
|
"loss": 0.1583, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.19059944702629567, |
|
"grad_norm": 1.0523838996887207, |
|
"learning_rate": 0.0002428100729583431, |
|
"loss": 0.194, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.1929525266192129, |
|
"grad_norm": 1.0871750116348267, |
|
"learning_rate": 0.00024210402447634737, |
|
"loss": 0.15, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.1953056062121301, |
|
"grad_norm": 1.515932321548462, |
|
"learning_rate": 0.0002413979759943516, |
|
"loss": 0.1895, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.19765868580504736, |
|
"grad_norm": 0.7211456298828125, |
|
"learning_rate": 0.00024069192751235584, |
|
"loss": 0.1685, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.20001176539796459, |
|
"grad_norm": 0.7664592862129211, |
|
"learning_rate": 0.00023998587903036007, |
|
"loss": 0.1658, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.2023648449908818, |
|
"grad_norm": 0.8728657960891724, |
|
"learning_rate": 0.0002392798305483643, |
|
"loss": 0.1468, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.20471792458379906, |
|
"grad_norm": 1.3027325868606567, |
|
"learning_rate": 0.00023857378206636854, |
|
"loss": 0.1633, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.20707100417671628, |
|
"grad_norm": 1.1061084270477295, |
|
"learning_rate": 0.00023786773358437278, |
|
"loss": 0.1861, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.2094240837696335, |
|
"grad_norm": 1.176365613937378, |
|
"learning_rate": 0.000237161685102377, |
|
"loss": 0.1589, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.21177716336255073, |
|
"grad_norm": 0.8307468295097351, |
|
"learning_rate": 0.00023645563662038127, |
|
"loss": 0.172, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.21413024295546798, |
|
"grad_norm": 1.2759816646575928, |
|
"learning_rate": 0.00023574958813838548, |
|
"loss": 0.1475, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.2164833225483852, |
|
"grad_norm": 1.661071538925171, |
|
"learning_rate": 0.00023504353965638974, |
|
"loss": 0.2048, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.21883640214130243, |
|
"grad_norm": 1.3144210577011108, |
|
"learning_rate": 0.00023433749117439395, |
|
"loss": 0.1582, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.22118948173421965, |
|
"grad_norm": 1.1830146312713623, |
|
"learning_rate": 0.00023363144269239821, |
|
"loss": 0.1567, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.2235425613271369, |
|
"grad_norm": 0.7755473256111145, |
|
"learning_rate": 0.00023292539421040242, |
|
"loss": 0.1369, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.22589564092005412, |
|
"grad_norm": 0.708152711391449, |
|
"learning_rate": 0.00023221934572840666, |
|
"loss": 0.1477, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.22824872051297135, |
|
"grad_norm": 0.9567592144012451, |
|
"learning_rate": 0.0002315132972464109, |
|
"loss": 0.1685, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.23060180010588857, |
|
"grad_norm": 1.019717812538147, |
|
"learning_rate": 0.00023080724876441513, |
|
"loss": 0.1485, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.23295487969880582, |
|
"grad_norm": 3.704050302505493, |
|
"learning_rate": 0.00023010120028241936, |
|
"loss": 0.182, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.23530795929172305, |
|
"grad_norm": 1.3113001585006714, |
|
"learning_rate": 0.0002293951518004236, |
|
"loss": 0.2033, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.23766103888464027, |
|
"grad_norm": 1.586300253868103, |
|
"learning_rate": 0.00022868910331842783, |
|
"loss": 0.1658, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.2400141184775575, |
|
"grad_norm": 0.7029755711555481, |
|
"learning_rate": 0.00022798305483643207, |
|
"loss": 0.1675, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.24236719807047474, |
|
"grad_norm": 0.9558175802230835, |
|
"learning_rate": 0.00022727700635443633, |
|
"loss": 0.1292, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.24472027766339197, |
|
"grad_norm": 0.552598774433136, |
|
"learning_rate": 0.00022657095787244054, |
|
"loss": 0.1271, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.2470733572563092, |
|
"grad_norm": 1.160657525062561, |
|
"learning_rate": 0.0002258649093904448, |
|
"loss": 0.1406, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.2494264368492264, |
|
"grad_norm": 0.9359754323959351, |
|
"learning_rate": 0.000225158860908449, |
|
"loss": 0.1456, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.25177951644214364, |
|
"grad_norm": 0.6799198985099792, |
|
"learning_rate": 0.00022445281242645327, |
|
"loss": 0.1235, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.2541325960350609, |
|
"grad_norm": 0.97700434923172, |
|
"learning_rate": 0.00022374676394445748, |
|
"loss": 0.1721, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.25648567562797814, |
|
"grad_norm": 0.6762118935585022, |
|
"learning_rate": 0.00022304071546246174, |
|
"loss": 0.2006, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.25883875522089533, |
|
"grad_norm": 0.6071228384971619, |
|
"learning_rate": 0.00022233466698046595, |
|
"loss": 0.1647, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.2611918348138126, |
|
"grad_norm": 0.7097590565681458, |
|
"learning_rate": 0.0002216286184984702, |
|
"loss": 0.1662, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.2635449144067298, |
|
"grad_norm": 0.48786184191703796, |
|
"learning_rate": 0.00022092257001647442, |
|
"loss": 0.1378, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.26589799399964703, |
|
"grad_norm": 0.7238913178443909, |
|
"learning_rate": 0.00022021652153447868, |
|
"loss": 0.163, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.2682510735925643, |
|
"grad_norm": 1.3571726083755493, |
|
"learning_rate": 0.00021951047305248292, |
|
"loss": 0.1545, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.2706041531854815, |
|
"grad_norm": 0.6683372259140015, |
|
"learning_rate": 0.00021880442457048715, |
|
"loss": 0.1375, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.27295723277839873, |
|
"grad_norm": 1.9159690141677856, |
|
"learning_rate": 0.0002180983760884914, |
|
"loss": 0.1604, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.275310312371316, |
|
"grad_norm": 1.6136759519577026, |
|
"learning_rate": 0.00021739232760649562, |
|
"loss": 0.1827, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.2776633919642332, |
|
"grad_norm": 1.2445416450500488, |
|
"learning_rate": 0.00021668627912449986, |
|
"loss": 0.1283, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.2800164715571504, |
|
"grad_norm": 1.143410563468933, |
|
"learning_rate": 0.0002159802306425041, |
|
"loss": 0.1571, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.2823695511500677, |
|
"grad_norm": 0.641952633857727, |
|
"learning_rate": 0.00021527418216050833, |
|
"loss": 0.1511, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.2847226307429849, |
|
"grad_norm": 0.9618122577667236, |
|
"learning_rate": 0.00021456813367851256, |
|
"loss": 0.1251, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.2870757103359021, |
|
"grad_norm": 1.040390133857727, |
|
"learning_rate": 0.0002138620851965168, |
|
"loss": 0.1481, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.2894287899288193, |
|
"grad_norm": 2.470360279083252, |
|
"learning_rate": 0.00021315603671452106, |
|
"loss": 0.1523, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.29178186952173657, |
|
"grad_norm": 1.15378737449646, |
|
"learning_rate": 0.00021244998823252527, |
|
"loss": 0.1526, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.2941349491146538, |
|
"grad_norm": 1.2236779928207397, |
|
"learning_rate": 0.00021174393975052953, |
|
"loss": 0.15, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.296488028707571, |
|
"grad_norm": 0.6974225640296936, |
|
"learning_rate": 0.00021103789126853374, |
|
"loss": 0.1529, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.29884110830048827, |
|
"grad_norm": 1.2019627094268799, |
|
"learning_rate": 0.000210331842786538, |
|
"loss": 0.1534, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.3011941878934055, |
|
"grad_norm": 1.5245829820632935, |
|
"learning_rate": 0.0002096257943045422, |
|
"loss": 0.1452, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.3035472674863227, |
|
"grad_norm": 1.5062931776046753, |
|
"learning_rate": 0.00020891974582254647, |
|
"loss": 0.1617, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.30590034707923996, |
|
"grad_norm": 0.5989176034927368, |
|
"learning_rate": 0.00020821369734055068, |
|
"loss": 0.1567, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.30825342667215716, |
|
"grad_norm": 1.1063286066055298, |
|
"learning_rate": 0.00020750764885855494, |
|
"loss": 0.1651, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.3106065062650744, |
|
"grad_norm": 0.9815717935562134, |
|
"learning_rate": 0.00020680160037655915, |
|
"loss": 0.1485, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.31295958585799166, |
|
"grad_norm": 1.218807578086853, |
|
"learning_rate": 0.0002060955518945634, |
|
"loss": 0.1151, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.31531266545090886, |
|
"grad_norm": 1.1629014015197754, |
|
"learning_rate": 0.00020538950341256765, |
|
"loss": 0.1406, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.3176657450438261, |
|
"grad_norm": 0.6818956732749939, |
|
"learning_rate": 0.00020468345493057188, |
|
"loss": 0.1465, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.32001882463674336, |
|
"grad_norm": 0.7869308590888977, |
|
"learning_rate": 0.00020397740644857612, |
|
"loss": 0.1515, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.32237190422966056, |
|
"grad_norm": 1.023478627204895, |
|
"learning_rate": 0.00020327135796658035, |
|
"loss": 0.1781, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.3247249838225778, |
|
"grad_norm": 1.0383384227752686, |
|
"learning_rate": 0.0002025653094845846, |
|
"loss": 0.1195, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.327078063415495, |
|
"grad_norm": 1.5291595458984375, |
|
"learning_rate": 0.00020185926100258882, |
|
"loss": 0.1334, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.32943114300841225, |
|
"grad_norm": 0.9488996267318726, |
|
"learning_rate": 0.00020115321252059306, |
|
"loss": 0.1368, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.3317842226013295, |
|
"grad_norm": 1.1703331470489502, |
|
"learning_rate": 0.0002004471640385973, |
|
"loss": 0.131, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.3341373021942467, |
|
"grad_norm": 0.6122885346412659, |
|
"learning_rate": 0.00019974111555660153, |
|
"loss": 0.1356, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.33649038178716395, |
|
"grad_norm": 0.7869921326637268, |
|
"learning_rate": 0.0001990350670746058, |
|
"loss": 0.1817, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.3388434613800812, |
|
"grad_norm": 0.691066324710846, |
|
"learning_rate": 0.00019832901859261, |
|
"loss": 0.131, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.3411965409729984, |
|
"grad_norm": 1.4205127954483032, |
|
"learning_rate": 0.00019762297011061426, |
|
"loss": 0.1366, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.34354962056591565, |
|
"grad_norm": 0.47127053141593933, |
|
"learning_rate": 0.00019691692162861847, |
|
"loss": 0.1498, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.3459027001588329, |
|
"grad_norm": 0.9336820840835571, |
|
"learning_rate": 0.00019621087314662273, |
|
"loss": 0.1512, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.3482557797517501, |
|
"grad_norm": 0.8124200105667114, |
|
"learning_rate": 0.00019550482466462694, |
|
"loss": 0.1319, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.35060885934466735, |
|
"grad_norm": 0.6921178698539734, |
|
"learning_rate": 0.0001947987761826312, |
|
"loss": 0.1279, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.35296193893758454, |
|
"grad_norm": 1.336229681968689, |
|
"learning_rate": 0.0001940927277006354, |
|
"loss": 0.1251, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.3553150185305018, |
|
"grad_norm": 0.9984803795814514, |
|
"learning_rate": 0.00019338667921863967, |
|
"loss": 0.1299, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.35766809812341904, |
|
"grad_norm": 1.0903042554855347, |
|
"learning_rate": 0.00019268063073664388, |
|
"loss": 0.1528, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.36002117771633624, |
|
"grad_norm": 0.666950523853302, |
|
"learning_rate": 0.00019197458225464814, |
|
"loss": 0.1446, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.3623742573092535, |
|
"grad_norm": 0.8104845285415649, |
|
"learning_rate": 0.00019126853377265238, |
|
"loss": 0.1221, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.36472733690217074, |
|
"grad_norm": 0.5904582738876343, |
|
"learning_rate": 0.00019056248529065661, |
|
"loss": 0.1164, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.36708041649508794, |
|
"grad_norm": 0.7703972458839417, |
|
"learning_rate": 0.00018985643680866085, |
|
"loss": 0.0978, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.3694334960880052, |
|
"grad_norm": 1.9245415925979614, |
|
"learning_rate": 0.00018915038832666508, |
|
"loss": 0.1624, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.3717865756809224, |
|
"grad_norm": 1.6459194421768188, |
|
"learning_rate": 0.00018844433984466932, |
|
"loss": 0.1289, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.37413965527383963, |
|
"grad_norm": 1.6774044036865234, |
|
"learning_rate": 0.00018773829136267355, |
|
"loss": 0.1468, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.3764927348667569, |
|
"grad_norm": 1.5878580808639526, |
|
"learning_rate": 0.0001870322428806778, |
|
"loss": 0.1318, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.3788458144596741, |
|
"grad_norm": 0.7039738297462463, |
|
"learning_rate": 0.00018632619439868203, |
|
"loss": 0.1242, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.38119889405259133, |
|
"grad_norm": 1.1770200729370117, |
|
"learning_rate": 0.00018562014591668626, |
|
"loss": 0.1321, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.3835519736455086, |
|
"grad_norm": 2.2201638221740723, |
|
"learning_rate": 0.00018491409743469052, |
|
"loss": 0.1214, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.3859050532384258, |
|
"grad_norm": 0.756149411201477, |
|
"learning_rate": 0.00018420804895269473, |
|
"loss": 0.1219, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.38825813283134303, |
|
"grad_norm": 0.5444088578224182, |
|
"learning_rate": 0.000183502000470699, |
|
"loss": 0.1346, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.3906112124242602, |
|
"grad_norm": 0.7643070816993713, |
|
"learning_rate": 0.0001827959519887032, |
|
"loss": 0.1324, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.3929642920171775, |
|
"grad_norm": 0.885362446308136, |
|
"learning_rate": 0.00018208990350670746, |
|
"loss": 0.1166, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.3953173716100947, |
|
"grad_norm": 0.7135679721832275, |
|
"learning_rate": 0.00018138385502471167, |
|
"loss": 0.1364, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.3976704512030119, |
|
"grad_norm": 0.5533025860786438, |
|
"learning_rate": 0.00018067780654271593, |
|
"loss": 0.1137, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.40002353079592917, |
|
"grad_norm": 0.5916281342506409, |
|
"learning_rate": 0.00017997175806072014, |
|
"loss": 0.1131, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.4023766103888464, |
|
"grad_norm": 0.8299354314804077, |
|
"learning_rate": 0.0001792657095787244, |
|
"loss": 0.1331, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.4047296899817636, |
|
"grad_norm": 0.7944399118423462, |
|
"learning_rate": 0.0001785596610967286, |
|
"loss": 0.1049, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.40708276957468087, |
|
"grad_norm": 0.6967952251434326, |
|
"learning_rate": 0.00017785361261473287, |
|
"loss": 0.0997, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.4094358491675981, |
|
"grad_norm": 0.42431318759918213, |
|
"learning_rate": 0.0001771475641327371, |
|
"loss": 0.0964, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.4117889287605153, |
|
"grad_norm": 0.6767364740371704, |
|
"learning_rate": 0.00017644151565074134, |
|
"loss": 0.1627, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.41414200835343257, |
|
"grad_norm": 1.0430301427841187, |
|
"learning_rate": 0.00017573546716874558, |
|
"loss": 0.1173, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.41649508794634976, |
|
"grad_norm": 0.6168161034584045, |
|
"learning_rate": 0.00017502941868674981, |
|
"loss": 0.1229, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.418848167539267, |
|
"grad_norm": 1.9067519903182983, |
|
"learning_rate": 0.00017432337020475405, |
|
"loss": 0.1369, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.42120124713218426, |
|
"grad_norm": 1.5157831907272339, |
|
"learning_rate": 0.00017361732172275829, |
|
"loss": 0.1243, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.42355432672510146, |
|
"grad_norm": 1.5152102708816528, |
|
"learning_rate": 0.00017291127324076252, |
|
"loss": 0.1395, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.4259074063180187, |
|
"grad_norm": 0.8262742161750793, |
|
"learning_rate": 0.00017220522475876676, |
|
"loss": 0.1467, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.42826048591093596, |
|
"grad_norm": 0.5484256744384766, |
|
"learning_rate": 0.000171499176276771, |
|
"loss": 0.1405, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.43061356550385316, |
|
"grad_norm": 0.7796267867088318, |
|
"learning_rate": 0.00017079312779477525, |
|
"loss": 0.1508, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.4329666450967704, |
|
"grad_norm": 0.7360082268714905, |
|
"learning_rate": 0.00017008707931277946, |
|
"loss": 0.1332, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.4353197246896876, |
|
"grad_norm": 0.8352281451225281, |
|
"learning_rate": 0.00016938103083078372, |
|
"loss": 0.1343, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.43767280428260485, |
|
"grad_norm": 0.6898388862609863, |
|
"learning_rate": 0.00016867498234878793, |
|
"loss": 0.0983, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.4400258838755221, |
|
"grad_norm": 0.3843238651752472, |
|
"learning_rate": 0.0001679689338667922, |
|
"loss": 0.1091, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.4423789634684393, |
|
"grad_norm": 0.7791532278060913, |
|
"learning_rate": 0.0001672628853847964, |
|
"loss": 0.1321, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.44473204306135655, |
|
"grad_norm": 0.9906323552131653, |
|
"learning_rate": 0.00016655683690280064, |
|
"loss": 0.1125, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.4470851226542738, |
|
"grad_norm": 0.631594181060791, |
|
"learning_rate": 0.00016585078842080487, |
|
"loss": 0.1328, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.449438202247191, |
|
"grad_norm": 1.4922380447387695, |
|
"learning_rate": 0.0001651447399388091, |
|
"loss": 0.1441, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.45179128184010825, |
|
"grad_norm": 0.6896445751190186, |
|
"learning_rate": 0.00016443869145681334, |
|
"loss": 0.146, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.45414436143302545, |
|
"grad_norm": 0.6470409035682678, |
|
"learning_rate": 0.00016373264297481758, |
|
"loss": 0.1123, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.4564974410259427, |
|
"grad_norm": 1.4532804489135742, |
|
"learning_rate": 0.00016302659449282184, |
|
"loss": 0.1204, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.45885052061885995, |
|
"grad_norm": 1.5582534074783325, |
|
"learning_rate": 0.00016232054601082605, |
|
"loss": 0.1275, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.46120360021177714, |
|
"grad_norm": 0.7568921446800232, |
|
"learning_rate": 0.0001616144975288303, |
|
"loss": 0.1373, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.4635566798046944, |
|
"grad_norm": 0.7904714941978455, |
|
"learning_rate": 0.00016090844904683452, |
|
"loss": 0.1281, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.46590975939761164, |
|
"grad_norm": 0.48104897141456604, |
|
"learning_rate": 0.00016020240056483878, |
|
"loss": 0.1173, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.46826283899052884, |
|
"grad_norm": 0.6676899194717407, |
|
"learning_rate": 0.000159496352082843, |
|
"loss": 0.1297, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.4706159185834461, |
|
"grad_norm": 0.7035501599311829, |
|
"learning_rate": 0.00015879030360084725, |
|
"loss": 0.1246, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.47296899817636334, |
|
"grad_norm": 1.289421796798706, |
|
"learning_rate": 0.00015808425511885146, |
|
"loss": 0.1501, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.47532207776928054, |
|
"grad_norm": 0.6186831593513489, |
|
"learning_rate": 0.00015737820663685572, |
|
"loss": 0.1156, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.4776751573621978, |
|
"grad_norm": 0.7897233963012695, |
|
"learning_rate": 0.00015667215815485993, |
|
"loss": 0.1317, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.480028236955115, |
|
"grad_norm": 1.1652599573135376, |
|
"learning_rate": 0.0001559661096728642, |
|
"loss": 0.1325, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.48238131654803224, |
|
"grad_norm": 0.6400769948959351, |
|
"learning_rate": 0.0001552600611908684, |
|
"loss": 0.1002, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.4847343961409495, |
|
"grad_norm": 0.5541133880615234, |
|
"learning_rate": 0.00015455401270887266, |
|
"loss": 0.1232, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.4870874757338667, |
|
"grad_norm": 0.605411946773529, |
|
"learning_rate": 0.0001538479642268769, |
|
"loss": 0.1102, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.48944055532678393, |
|
"grad_norm": 0.49058374762535095, |
|
"learning_rate": 0.00015314191574488113, |
|
"loss": 0.1228, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.4917936349197012, |
|
"grad_norm": 0.7565241456031799, |
|
"learning_rate": 0.00015243586726288537, |
|
"loss": 0.1009, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.4941467145126184, |
|
"grad_norm": 0.4517477750778198, |
|
"learning_rate": 0.0001517298187808896, |
|
"loss": 0.129, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.49649979410553563, |
|
"grad_norm": 0.7871853709220886, |
|
"learning_rate": 0.00015102377029889384, |
|
"loss": 0.1049, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.4988528736984528, |
|
"grad_norm": 0.4314168691635132, |
|
"learning_rate": 0.00015031772181689807, |
|
"loss": 0.117, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.5012059532913701, |
|
"grad_norm": 0.8347052335739136, |
|
"learning_rate": 0.0001496116733349023, |
|
"loss": 0.1336, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.5035590328842873, |
|
"grad_norm": 0.42039480805397034, |
|
"learning_rate": 0.00014890562485290657, |
|
"loss": 0.1176, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.5059121124772046, |
|
"grad_norm": 1.1371684074401855, |
|
"learning_rate": 0.0001481995763709108, |
|
"loss": 0.1362, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.5082651920701218, |
|
"grad_norm": 0.8690921664237976, |
|
"learning_rate": 0.00014749352788891504, |
|
"loss": 0.1444, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.510618271663039, |
|
"grad_norm": 0.3952578604221344, |
|
"learning_rate": 0.00014678747940691928, |
|
"loss": 0.1411, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 0.5129713512559563, |
|
"grad_norm": 1.0104624032974243, |
|
"learning_rate": 0.0001460814309249235, |
|
"loss": 0.1127, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.5153244308488735, |
|
"grad_norm": 0.7708560824394226, |
|
"learning_rate": 0.00014537538244292775, |
|
"loss": 0.1364, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.5176775104417907, |
|
"grad_norm": 3.323113203048706, |
|
"learning_rate": 0.00014466933396093198, |
|
"loss": 0.1326, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.520030590034708, |
|
"grad_norm": 0.5021951198577881, |
|
"learning_rate": 0.00014396328547893622, |
|
"loss": 0.1075, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 0.5223836696276252, |
|
"grad_norm": 0.5558544397354126, |
|
"learning_rate": 0.00014325723699694045, |
|
"loss": 0.1019, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.5247367492205424, |
|
"grad_norm": 0.7476164102554321, |
|
"learning_rate": 0.0001425511885149447, |
|
"loss": 0.108, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 0.5270898288134596, |
|
"grad_norm": 0.8783542513847351, |
|
"learning_rate": 0.00014184514003294892, |
|
"loss": 0.1182, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.5294429084063769, |
|
"grad_norm": 0.5716719627380371, |
|
"learning_rate": 0.00014113909155095316, |
|
"loss": 0.1048, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.5317959879992941, |
|
"grad_norm": 0.41919055581092834, |
|
"learning_rate": 0.0001404330430689574, |
|
"loss": 0.1071, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 0.5341490675922113, |
|
"grad_norm": 0.672885537147522, |
|
"learning_rate": 0.00013972699458696163, |
|
"loss": 0.1333, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 0.5365021471851286, |
|
"grad_norm": 0.7414030432701111, |
|
"learning_rate": 0.00013902094610496586, |
|
"loss": 0.1288, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.5388552267780458, |
|
"grad_norm": 1.1601518392562866, |
|
"learning_rate": 0.0001383148976229701, |
|
"loss": 0.1099, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 0.541208306370963, |
|
"grad_norm": 0.4423375129699707, |
|
"learning_rate": 0.00013760884914097433, |
|
"loss": 0.1049, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.5435613859638803, |
|
"grad_norm": 0.9248809218406677, |
|
"learning_rate": 0.00013690280065897857, |
|
"loss": 0.1172, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 0.5459144655567975, |
|
"grad_norm": 1.3502943515777588, |
|
"learning_rate": 0.0001361967521769828, |
|
"loss": 0.1303, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.5482675451497147, |
|
"grad_norm": 1.488297939300537, |
|
"learning_rate": 0.00013549070369498704, |
|
"loss": 0.1182, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 0.550620624742632, |
|
"grad_norm": 0.6636572480201721, |
|
"learning_rate": 0.0001347846552129913, |
|
"loss": 0.1233, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.5529737043355492, |
|
"grad_norm": 0.5864549279212952, |
|
"learning_rate": 0.00013407860673099554, |
|
"loss": 0.1102, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.5553267839284663, |
|
"grad_norm": 1.9224406480789185, |
|
"learning_rate": 0.00013337255824899977, |
|
"loss": 0.1449, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 0.5576798635213837, |
|
"grad_norm": 1.1239560842514038, |
|
"learning_rate": 0.00013266650976700398, |
|
"loss": 0.1155, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 0.5600329431143009, |
|
"grad_norm": 0.6336050629615784, |
|
"learning_rate": 0.00013196046128500821, |
|
"loss": 0.1193, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 0.562386022707218, |
|
"grad_norm": 0.9129360914230347, |
|
"learning_rate": 0.00013125441280301245, |
|
"loss": 0.1121, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 0.5647391023001354, |
|
"grad_norm": 0.6220555305480957, |
|
"learning_rate": 0.00013054836432101668, |
|
"loss": 0.1172, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.5670921818930525, |
|
"grad_norm": 0.8981531262397766, |
|
"learning_rate": 0.00012984231583902092, |
|
"loss": 0.1184, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 0.5694452614859697, |
|
"grad_norm": 0.7610392570495605, |
|
"learning_rate": 0.00012913626735702515, |
|
"loss": 0.1204, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 0.5717983410788869, |
|
"grad_norm": 0.5133729577064514, |
|
"learning_rate": 0.0001284302188750294, |
|
"loss": 0.1081, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 0.5741514206718042, |
|
"grad_norm": 0.8097817897796631, |
|
"learning_rate": 0.00012772417039303363, |
|
"loss": 0.1142, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 0.5765045002647214, |
|
"grad_norm": 1.8712083101272583, |
|
"learning_rate": 0.00012701812191103786, |
|
"loss": 0.1234, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.5788575798576386, |
|
"grad_norm": 0.8425026535987854, |
|
"learning_rate": 0.00012631207342904212, |
|
"loss": 0.1027, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 0.5812106594505559, |
|
"grad_norm": 0.5562009811401367, |
|
"learning_rate": 0.00012560602494704636, |
|
"loss": 0.0916, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 0.5835637390434731, |
|
"grad_norm": 0.45057183504104614, |
|
"learning_rate": 0.0001248999764650506, |
|
"loss": 0.1166, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 0.5859168186363903, |
|
"grad_norm": 0.5411068797111511, |
|
"learning_rate": 0.00012419392798305483, |
|
"loss": 0.1254, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 0.5882698982293076, |
|
"grad_norm": 0.9400952458381653, |
|
"learning_rate": 0.00012348787950105906, |
|
"loss": 0.1411, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.5906229778222248, |
|
"grad_norm": 0.4275170564651489, |
|
"learning_rate": 0.0001227818310190633, |
|
"loss": 0.1089, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 0.592976057415142, |
|
"grad_norm": 1.2033214569091797, |
|
"learning_rate": 0.00012207578253706753, |
|
"loss": 0.108, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 0.5953291370080593, |
|
"grad_norm": 1.257379412651062, |
|
"learning_rate": 0.00012136973405507177, |
|
"loss": 0.1207, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 0.5976822166009765, |
|
"grad_norm": 0.7070032954216003, |
|
"learning_rate": 0.000120663685573076, |
|
"loss": 0.0879, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 0.6000352961938937, |
|
"grad_norm": 0.8550868034362793, |
|
"learning_rate": 0.00011995763709108024, |
|
"loss": 0.1087, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.602388375786811, |
|
"grad_norm": 0.8301357626914978, |
|
"learning_rate": 0.00011925158860908447, |
|
"loss": 0.1266, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 0.6047414553797282, |
|
"grad_norm": 0.4070800542831421, |
|
"learning_rate": 0.00011854554012708871, |
|
"loss": 0.1062, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 0.6070945349726454, |
|
"grad_norm": 1.1967391967773438, |
|
"learning_rate": 0.00011783949164509294, |
|
"loss": 0.1147, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 0.6094476145655627, |
|
"grad_norm": 0.5281302332878113, |
|
"learning_rate": 0.00011713344316309718, |
|
"loss": 0.0912, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 0.6118006941584799, |
|
"grad_norm": 0.5271784067153931, |
|
"learning_rate": 0.00011642739468110142, |
|
"loss": 0.1084, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.6141537737513971, |
|
"grad_norm": 0.4973151683807373, |
|
"learning_rate": 0.00011572134619910566, |
|
"loss": 0.1242, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 0.6165068533443143, |
|
"grad_norm": 0.4281303882598877, |
|
"learning_rate": 0.0001150152977171099, |
|
"loss": 0.1101, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 0.6188599329372316, |
|
"grad_norm": 0.5142689347267151, |
|
"learning_rate": 0.00011430924923511413, |
|
"loss": 0.1182, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 0.6212130125301488, |
|
"grad_norm": 0.5125661492347717, |
|
"learning_rate": 0.00011360320075311837, |
|
"loss": 0.0943, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 0.623566092123066, |
|
"grad_norm": 0.43077680468559265, |
|
"learning_rate": 0.0001128971522711226, |
|
"loss": 0.0948, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.6259191717159833, |
|
"grad_norm": 0.5074141621589661, |
|
"learning_rate": 0.00011219110378912684, |
|
"loss": 0.0853, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 0.6282722513089005, |
|
"grad_norm": 0.8260855674743652, |
|
"learning_rate": 0.00011148505530713107, |
|
"loss": 0.1104, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 0.6306253309018177, |
|
"grad_norm": 0.7819215059280396, |
|
"learning_rate": 0.00011077900682513531, |
|
"loss": 0.1256, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 0.632978410494735, |
|
"grad_norm": 0.46884438395500183, |
|
"learning_rate": 0.00011007295834313955, |
|
"loss": 0.1027, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 0.6353314900876522, |
|
"grad_norm": 0.9515593647956848, |
|
"learning_rate": 0.00010936690986114378, |
|
"loss": 0.112, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.6376845696805694, |
|
"grad_norm": 0.3602767586708069, |
|
"learning_rate": 0.00010866086137914803, |
|
"loss": 0.1053, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 0.6400376492734867, |
|
"grad_norm": 0.7740781903266907, |
|
"learning_rate": 0.00010795481289715226, |
|
"loss": 0.112, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 0.6423907288664039, |
|
"grad_norm": 0.5003033876419067, |
|
"learning_rate": 0.0001072487644151565, |
|
"loss": 0.0985, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 0.6447438084593211, |
|
"grad_norm": 0.4092664122581482, |
|
"learning_rate": 0.00010654271593316073, |
|
"loss": 0.1113, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 0.6470968880522384, |
|
"grad_norm": 0.446584552526474, |
|
"learning_rate": 0.00010583666745116497, |
|
"loss": 0.0909, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.6494499676451556, |
|
"grad_norm": 0.3130131661891937, |
|
"learning_rate": 0.0001051306189691692, |
|
"loss": 0.0954, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 0.6518030472380728, |
|
"grad_norm": 0.7232083082199097, |
|
"learning_rate": 0.00010442457048717344, |
|
"loss": 0.1132, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 0.65415612683099, |
|
"grad_norm": 0.5579691529273987, |
|
"learning_rate": 0.00010371852200517768, |
|
"loss": 0.1045, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 0.6565092064239073, |
|
"grad_norm": 0.5319089889526367, |
|
"learning_rate": 0.00010301247352318191, |
|
"loss": 0.1215, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 0.6588622860168245, |
|
"grad_norm": 0.516445517539978, |
|
"learning_rate": 0.00010230642504118615, |
|
"loss": 0.111, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.6612153656097417, |
|
"grad_norm": 0.25264236330986023, |
|
"learning_rate": 0.0001016003765591904, |
|
"loss": 0.1126, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 0.663568445202659, |
|
"grad_norm": 0.7910987138748169, |
|
"learning_rate": 0.00010089432807719463, |
|
"loss": 0.1306, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 0.6659215247955762, |
|
"grad_norm": 0.7823461890220642, |
|
"learning_rate": 0.00010018827959519886, |
|
"loss": 0.0967, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 0.6682746043884934, |
|
"grad_norm": 0.7126127481460571, |
|
"learning_rate": 9.94822311132031e-05, |
|
"loss": 0.1296, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 0.6706276839814107, |
|
"grad_norm": 0.9327739477157593, |
|
"learning_rate": 9.877618263120733e-05, |
|
"loss": 0.1115, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.6729807635743279, |
|
"grad_norm": 0.7680268883705139, |
|
"learning_rate": 9.807013414921157e-05, |
|
"loss": 0.1055, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 0.6753338431672451, |
|
"grad_norm": 0.7711540460586548, |
|
"learning_rate": 9.73640856672158e-05, |
|
"loss": 0.0951, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 0.6776869227601624, |
|
"grad_norm": 0.5041959881782532, |
|
"learning_rate": 9.665803718522004e-05, |
|
"loss": 0.1087, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 0.6800400023530796, |
|
"grad_norm": 0.5102591514587402, |
|
"learning_rate": 9.595198870322428e-05, |
|
"loss": 0.1176, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 0.6823930819459968, |
|
"grad_norm": 0.7100384831428528, |
|
"learning_rate": 9.524594022122851e-05, |
|
"loss": 0.1091, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.6847461615389141, |
|
"grad_norm": 0.6806867122650146, |
|
"learning_rate": 9.453989173923276e-05, |
|
"loss": 0.1251, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 0.6870992411318313, |
|
"grad_norm": 0.6659530401229858, |
|
"learning_rate": 9.3833843257237e-05, |
|
"loss": 0.0835, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 0.6894523207247485, |
|
"grad_norm": 0.4317012429237366, |
|
"learning_rate": 9.312779477524123e-05, |
|
"loss": 0.0893, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 0.6918054003176658, |
|
"grad_norm": 0.5916824340820312, |
|
"learning_rate": 9.242174629324546e-05, |
|
"loss": 0.0905, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 0.694158479910583, |
|
"grad_norm": 0.7429795265197754, |
|
"learning_rate": 9.17156978112497e-05, |
|
"loss": 0.1063, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 0.6965115595035002, |
|
"grad_norm": 0.87420254945755, |
|
"learning_rate": 9.100964932925394e-05, |
|
"loss": 0.1042, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 0.6988646390964174, |
|
"grad_norm": 0.49567267298698425, |
|
"learning_rate": 9.030360084725817e-05, |
|
"loss": 0.1045, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 0.7012177186893347, |
|
"grad_norm": 2.633138418197632, |
|
"learning_rate": 8.95975523652624e-05, |
|
"loss": 0.0884, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 0.7035707982822519, |
|
"grad_norm": 0.33752286434173584, |
|
"learning_rate": 8.889150388326664e-05, |
|
"loss": 0.0848, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 0.7059238778751691, |
|
"grad_norm": 0.5974826812744141, |
|
"learning_rate": 8.818545540127088e-05, |
|
"loss": 0.0971, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.7082769574680864, |
|
"grad_norm": 0.43427976965904236, |
|
"learning_rate": 8.747940691927512e-05, |
|
"loss": 0.1165, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 0.7106300370610036, |
|
"grad_norm": 0.7770646810531616, |
|
"learning_rate": 8.677335843727936e-05, |
|
"loss": 0.1084, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 0.7129831166539208, |
|
"grad_norm": 0.5276495218276978, |
|
"learning_rate": 8.60673099552836e-05, |
|
"loss": 0.111, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 0.7153361962468381, |
|
"grad_norm": 0.9737383127212524, |
|
"learning_rate": 8.536126147328783e-05, |
|
"loss": 0.0972, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 0.7176892758397553, |
|
"grad_norm": 0.36562997102737427, |
|
"learning_rate": 8.465521299129207e-05, |
|
"loss": 0.093, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 0.7200423554326725, |
|
"grad_norm": 0.8244528770446777, |
|
"learning_rate": 8.39491645092963e-05, |
|
"loss": 0.1263, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 0.7223954350255898, |
|
"grad_norm": 1.9532008171081543, |
|
"learning_rate": 8.324311602730054e-05, |
|
"loss": 0.1251, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 0.724748514618507, |
|
"grad_norm": 0.572896420955658, |
|
"learning_rate": 8.253706754530477e-05, |
|
"loss": 0.0875, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 0.7271015942114242, |
|
"grad_norm": 1.2975929975509644, |
|
"learning_rate": 8.1831019063309e-05, |
|
"loss": 0.1023, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 0.7294546738043415, |
|
"grad_norm": 0.5758102536201477, |
|
"learning_rate": 8.112497058131324e-05, |
|
"loss": 0.1019, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.7318077533972587, |
|
"grad_norm": 0.553327202796936, |
|
"learning_rate": 8.041892209931749e-05, |
|
"loss": 0.1128, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 0.7341608329901759, |
|
"grad_norm": 0.5465438961982727, |
|
"learning_rate": 7.971287361732173e-05, |
|
"loss": 0.1237, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 0.7365139125830932, |
|
"grad_norm": 0.46917715668678284, |
|
"learning_rate": 7.900682513532596e-05, |
|
"loss": 0.0964, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 0.7388669921760104, |
|
"grad_norm": 0.8454899787902832, |
|
"learning_rate": 7.83007766533302e-05, |
|
"loss": 0.0901, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 0.7412200717689276, |
|
"grad_norm": 0.8698781728744507, |
|
"learning_rate": 7.759472817133443e-05, |
|
"loss": 0.1119, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.7435731513618448, |
|
"grad_norm": 1.7399003505706787, |
|
"learning_rate": 7.688867968933867e-05, |
|
"loss": 0.1122, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 0.7459262309547621, |
|
"grad_norm": 0.4506986141204834, |
|
"learning_rate": 7.61826312073429e-05, |
|
"loss": 0.09, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 0.7482793105476793, |
|
"grad_norm": 0.7856936454772949, |
|
"learning_rate": 7.547658272534714e-05, |
|
"loss": 0.1194, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 0.7506323901405965, |
|
"grad_norm": 0.5778619050979614, |
|
"learning_rate": 7.477053424335137e-05, |
|
"loss": 0.1167, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 0.7529854697335138, |
|
"grad_norm": 0.4940952658653259, |
|
"learning_rate": 7.40644857613556e-05, |
|
"loss": 0.0813, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.755338549326431, |
|
"grad_norm": 1.1496696472167969, |
|
"learning_rate": 7.335843727935984e-05, |
|
"loss": 0.0966, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 0.7576916289193482, |
|
"grad_norm": 0.4551859498023987, |
|
"learning_rate": 7.265238879736408e-05, |
|
"loss": 0.0956, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 0.7600447085122655, |
|
"grad_norm": 0.5476594567298889, |
|
"learning_rate": 7.194634031536831e-05, |
|
"loss": 0.1225, |
|
"step": 3230 |
|
}, |
|
    {
      "epoch": 0.7623977881051827,
      "grad_norm": 0.4413054287433624,
      "learning_rate": 7.124029183337255e-05,
      "loss": 0.0996,
      "step": 3240
    },
    {
      "epoch": 0.7647508676980999,
      "grad_norm": 0.6522489190101624,
      "learning_rate": 7.053424335137678e-05,
      "loss": 0.0946,
      "step": 3250
    },
    {
      "epoch": 0.7671039472910172,
      "grad_norm": 0.4750779867172241,
      "learning_rate": 6.982819486938102e-05,
      "loss": 0.0788,
      "step": 3260
    },
    {
      "epoch": 0.7694570268839344,
      "grad_norm": 0.336505264043808,
      "learning_rate": 6.912214638738527e-05,
      "loss": 0.0841,
      "step": 3270
    },
    {
      "epoch": 0.7718101064768516,
      "grad_norm": 1.4274874925613403,
      "learning_rate": 6.84160979053895e-05,
      "loss": 0.0991,
      "step": 3280
    },
    {
      "epoch": 0.7741631860697689,
      "grad_norm": 0.6464115977287292,
      "learning_rate": 6.771004942339374e-05,
      "loss": 0.1172,
      "step": 3290
    },
    {
      "epoch": 0.7765162656626861,
      "grad_norm": 0.35535725951194763,
      "learning_rate": 6.700400094139797e-05,
      "loss": 0.09,
      "step": 3300
    },
    {
      "epoch": 0.7788693452556033,
      "grad_norm": 0.22626227140426636,
      "learning_rate": 6.62979524594022e-05,
      "loss": 0.089,
      "step": 3310
    },
    {
      "epoch": 0.7812224248485204,
      "grad_norm": 0.6091925501823425,
      "learning_rate": 6.559190397740644e-05,
      "loss": 0.0851,
      "step": 3320
    },
    {
      "epoch": 0.7835755044414378,
      "grad_norm": 2.3381729125976562,
      "learning_rate": 6.488585549541068e-05,
      "loss": 0.1001,
      "step": 3330
    },
    {
      "epoch": 0.785928584034355,
      "grad_norm": 0.41597291827201843,
      "learning_rate": 6.417980701341491e-05,
      "loss": 0.0985,
      "step": 3340
    },
    {
      "epoch": 0.7882816636272721,
      "grad_norm": 0.6187950372695923,
      "learning_rate": 6.347375853141915e-05,
      "loss": 0.0877,
      "step": 3350
    },
    {
      "epoch": 0.7906347432201895,
      "grad_norm": 0.4807620942592621,
      "learning_rate": 6.276771004942338e-05,
      "loss": 0.1074,
      "step": 3360
    },
    {
      "epoch": 0.7929878228131066,
      "grad_norm": 0.2998965382575989,
      "learning_rate": 6.206166156742763e-05,
      "loss": 0.1044,
      "step": 3370
    },
    {
      "epoch": 0.7953409024060238,
      "grad_norm": 0.5904129147529602,
      "learning_rate": 6.135561308543187e-05,
      "loss": 0.1073,
      "step": 3380
    },
    {
      "epoch": 0.7976939819989411,
      "grad_norm": 0.6356788277626038,
      "learning_rate": 6.06495646034361e-05,
      "loss": 0.1121,
      "step": 3390
    },
    {
      "epoch": 0.8000470615918583,
      "grad_norm": 0.9147433638572693,
      "learning_rate": 5.994351612144034e-05,
      "loss": 0.1103,
      "step": 3400
    },
    {
      "epoch": 0.8024001411847755,
      "grad_norm": 0.8032605051994324,
      "learning_rate": 5.923746763944457e-05,
      "loss": 0.0965,
      "step": 3410
    },
    {
      "epoch": 0.8047532207776928,
      "grad_norm": 0.7935906052589417,
      "learning_rate": 5.853141915744881e-05,
      "loss": 0.1007,
      "step": 3420
    },
    {
      "epoch": 0.80710630037061,
      "grad_norm": 0.4112412631511688,
      "learning_rate": 5.782537067545304e-05,
      "loss": 0.0882,
      "step": 3430
    },
    {
      "epoch": 0.8094593799635272,
      "grad_norm": 0.8190514445304871,
      "learning_rate": 5.7119322193457284e-05,
      "loss": 0.1019,
      "step": 3440
    },
    {
      "epoch": 0.8118124595564445,
      "grad_norm": 0.6029698848724365,
      "learning_rate": 5.641327371146152e-05,
      "loss": 0.1053,
      "step": 3450
    },
    {
      "epoch": 0.8141655391493617,
      "grad_norm": 0.43347781896591187,
      "learning_rate": 5.5707225229465755e-05,
      "loss": 0.1044,
      "step": 3460
    },
    {
      "epoch": 0.8165186187422789,
      "grad_norm": 1.5235440731048584,
      "learning_rate": 5.500117674746999e-05,
      "loss": 0.0982,
      "step": 3470
    },
    {
      "epoch": 0.8188716983351962,
      "grad_norm": 0.5716174244880676,
      "learning_rate": 5.4295128265474225e-05,
      "loss": 0.1078,
      "step": 3480
    },
    {
      "epoch": 0.8212247779281134,
      "grad_norm": 1.5008090734481812,
      "learning_rate": 5.358907978347847e-05,
      "loss": 0.0915,
      "step": 3490
    },
    {
      "epoch": 0.8235778575210306,
      "grad_norm": 0.49782001972198486,
      "learning_rate": 5.28830313014827e-05,
      "loss": 0.0892,
      "step": 3500
    },
    {
      "epoch": 0.8259309371139478,
      "grad_norm": 0.4466950297355652,
      "learning_rate": 5.217698281948694e-05,
      "loss": 0.0905,
      "step": 3510
    },
    {
      "epoch": 0.8282840167068651,
      "grad_norm": 0.5504721403121948,
      "learning_rate": 5.147093433749117e-05,
      "loss": 0.1128,
      "step": 3520
    },
    {
      "epoch": 0.8306370962997823,
      "grad_norm": 0.4870951473712921,
      "learning_rate": 5.076488585549541e-05,
      "loss": 0.0876,
      "step": 3530
    },
    {
      "epoch": 0.8329901758926995,
      "grad_norm": 0.6789172887802124,
      "learning_rate": 5.005883737349965e-05,
      "loss": 0.1004,
      "step": 3540
    },
    {
      "epoch": 0.8353432554856168,
      "grad_norm": 0.5021870136260986,
      "learning_rate": 4.935278889150388e-05,
      "loss": 0.0862,
      "step": 3550
    },
    {
      "epoch": 0.837696335078534,
      "grad_norm": 0.5829181671142578,
      "learning_rate": 4.864674040950811e-05,
      "loss": 0.0994,
      "step": 3560
    },
    {
      "epoch": 0.8400494146714512,
      "grad_norm": 1.029181957244873,
      "learning_rate": 4.794069192751235e-05,
      "loss": 0.1144,
      "step": 3570
    },
    {
      "epoch": 0.8424024942643685,
      "grad_norm": 0.6730376482009888,
      "learning_rate": 4.723464344551658e-05,
      "loss": 0.106,
      "step": 3580
    },
    {
      "epoch": 0.8447555738572857,
      "grad_norm": 0.6129499673843384,
      "learning_rate": 4.652859496352082e-05,
      "loss": 0.0896,
      "step": 3590
    },
    {
      "epoch": 0.8471086534502029,
      "grad_norm": 0.422830194234848,
      "learning_rate": 4.582254648152506e-05,
      "loss": 0.0846,
      "step": 3600
    },
    {
      "epoch": 0.8494617330431202,
      "grad_norm": 0.5306664109230042,
      "learning_rate": 4.5116497999529296e-05,
      "loss": 0.1059,
      "step": 3610
    },
    {
      "epoch": 0.8518148126360374,
      "grad_norm": 0.6436883807182312,
      "learning_rate": 4.441044951753353e-05,
      "loss": 0.1132,
      "step": 3620
    },
    {
      "epoch": 0.8541678922289546,
      "grad_norm": 0.4121890962123871,
      "learning_rate": 4.3704401035537766e-05,
      "loss": 0.0864,
      "step": 3630
    },
    {
      "epoch": 0.8565209718218719,
      "grad_norm": 0.42521169781684875,
      "learning_rate": 4.2998352553542e-05,
      "loss": 0.1011,
      "step": 3640
    },
    {
      "epoch": 0.8588740514147891,
      "grad_norm": 0.49623095989227295,
      "learning_rate": 4.229230407154624e-05,
      "loss": 0.1093,
      "step": 3650
    },
    {
      "epoch": 0.8612271310077063,
      "grad_norm": 0.5516742467880249,
      "learning_rate": 4.158625558955048e-05,
      "loss": 0.1009,
      "step": 3660
    },
    {
      "epoch": 0.8635802106006236,
      "grad_norm": 0.37128451466560364,
      "learning_rate": 4.0880207107554713e-05,
      "loss": 0.0717,
      "step": 3670
    },
    {
      "epoch": 0.8659332901935408,
      "grad_norm": 0.3802624046802521,
      "learning_rate": 4.017415862555895e-05,
      "loss": 0.0891,
      "step": 3680
    },
    {
      "epoch": 0.868286369786458,
      "grad_norm": 0.35558944940567017,
      "learning_rate": 3.9468110143563184e-05,
      "loss": 0.0863,
      "step": 3690
    },
    {
      "epoch": 0.8706394493793752,
      "grad_norm": 0.2548139989376068,
      "learning_rate": 3.8762061661567426e-05,
      "loss": 0.0946,
      "step": 3700
    },
    {
      "epoch": 0.8729925289722925,
      "grad_norm": 0.3489900827407837,
      "learning_rate": 3.805601317957166e-05,
      "loss": 0.0794,
      "step": 3710
    },
    {
      "epoch": 0.8753456085652097,
      "grad_norm": 0.7514833807945251,
      "learning_rate": 3.7349964697575896e-05,
      "loss": 0.1026,
      "step": 3720
    },
    {
      "epoch": 0.8776986881581269,
      "grad_norm": 0.28846803307533264,
      "learning_rate": 3.664391621558013e-05,
      "loss": 0.107,
      "step": 3730
    },
    {
      "epoch": 0.8800517677510442,
      "grad_norm": 0.3054257333278656,
      "learning_rate": 3.5937867733584366e-05,
      "loss": 0.0839,
      "step": 3740
    },
    {
      "epoch": 0.8824048473439614,
      "grad_norm": 0.487393856048584,
      "learning_rate": 3.523181925158861e-05,
      "loss": 0.099,
      "step": 3750
    },
    {
      "epoch": 0.8847579269368786,
      "grad_norm": 0.7874276041984558,
      "learning_rate": 3.4525770769592843e-05,
      "loss": 0.0873,
      "step": 3760
    },
    {
      "epoch": 0.8871110065297959,
      "grad_norm": 0.3583498001098633,
      "learning_rate": 3.381972228759708e-05,
      "loss": 0.0854,
      "step": 3770
    },
    {
      "epoch": 0.8894640861227131,
      "grad_norm": 0.5606823563575745,
      "learning_rate": 3.3113673805601314e-05,
      "loss": 0.1106,
      "step": 3780
    },
    {
      "epoch": 0.8918171657156303,
      "grad_norm": 0.48208296298980713,
      "learning_rate": 3.240762532360555e-05,
      "loss": 0.1138,
      "step": 3790
    },
    {
      "epoch": 0.8941702453085476,
      "grad_norm": 1.026995301246643,
      "learning_rate": 3.170157684160979e-05,
      "loss": 0.0877,
      "step": 3800
    },
    {
      "epoch": 0.8965233249014648,
      "grad_norm": 0.7940952777862549,
      "learning_rate": 3.0995528359614026e-05,
      "loss": 0.069,
      "step": 3810
    },
    {
      "epoch": 0.898876404494382,
      "grad_norm": 0.7711090445518494,
      "learning_rate": 3.028947987761826e-05,
      "loss": 0.0884,
      "step": 3820
    },
    {
      "epoch": 0.9012294840872993,
      "grad_norm": 0.6985650062561035,
      "learning_rate": 2.9583431395622496e-05,
      "loss": 0.0929,
      "step": 3830
    },
    {
      "epoch": 0.9035825636802165,
      "grad_norm": 0.5291894674301147,
      "learning_rate": 2.8877382913626735e-05,
      "loss": 0.1166,
      "step": 3840
    },
    {
      "epoch": 0.9059356432731337,
      "grad_norm": 0.3929837644100189,
      "learning_rate": 2.817133443163097e-05,
      "loss": 0.0904,
      "step": 3850
    },
    {
      "epoch": 0.9082887228660509,
      "grad_norm": 0.492017537355423,
      "learning_rate": 2.746528594963521e-05,
      "loss": 0.0986,
      "step": 3860
    },
    {
      "epoch": 0.9106418024589682,
      "grad_norm": 0.5756918787956238,
      "learning_rate": 2.6759237467639444e-05,
      "loss": 0.0897,
      "step": 3870
    },
    {
      "epoch": 0.9129948820518854,
      "grad_norm": 0.5781024098396301,
      "learning_rate": 2.605318898564368e-05,
      "loss": 0.0799,
      "step": 3880
    },
    {
      "epoch": 0.9153479616448026,
      "grad_norm": 0.28270334005355835,
      "learning_rate": 2.5347140503647918e-05,
      "loss": 0.0889,
      "step": 3890
    },
    {
      "epoch": 0.9177010412377199,
      "grad_norm": 0.5788043737411499,
      "learning_rate": 2.464109202165215e-05,
      "loss": 0.0788,
      "step": 3900
    },
    {
      "epoch": 0.9200541208306371,
      "grad_norm": 0.563836932182312,
      "learning_rate": 2.3935043539656384e-05,
      "loss": 0.0914,
      "step": 3910
    },
    {
      "epoch": 0.9224072004235543,
      "grad_norm": 0.4077290892601013,
      "learning_rate": 2.3228995057660623e-05,
      "loss": 0.1057,
      "step": 3920
    },
    {
      "epoch": 0.9247602800164716,
      "grad_norm": 0.6209468841552734,
      "learning_rate": 2.2522946575664858e-05,
      "loss": 0.0812,
      "step": 3930
    },
    {
      "epoch": 0.9271133596093888,
      "grad_norm": 0.542506754398346,
      "learning_rate": 2.1816898093669097e-05,
      "loss": 0.0951,
      "step": 3940
    },
    {
      "epoch": 0.929466439202306,
      "grad_norm": 0.5754973292350769,
      "learning_rate": 2.1110849611673332e-05,
      "loss": 0.1023,
      "step": 3950
    },
    {
      "epoch": 0.9318195187952233,
      "grad_norm": 0.3798030912876129,
      "learning_rate": 2.0404801129677567e-05,
      "loss": 0.0997,
      "step": 3960
    },
    {
      "epoch": 0.9341725983881405,
      "grad_norm": 0.6593634486198425,
      "learning_rate": 1.9698752647681806e-05,
      "loss": 0.1056,
      "step": 3970
    },
    {
      "epoch": 0.9365256779810577,
      "grad_norm": 0.46481505036354065,
      "learning_rate": 1.899270416568604e-05,
      "loss": 0.0795,
      "step": 3980
    },
    {
      "epoch": 0.938878757573975,
      "grad_norm": 0.5140686631202698,
      "learning_rate": 1.828665568369028e-05,
      "loss": 0.0809,
      "step": 3990
    },
    {
      "epoch": 0.9412318371668922,
      "grad_norm": 0.8201892375946045,
      "learning_rate": 1.7580607201694515e-05,
      "loss": 0.0851,
      "step": 4000
    },
    {
      "epoch": 0.9435849167598094,
      "grad_norm": 0.3848946690559387,
      "learning_rate": 1.687455871969875e-05,
      "loss": 0.0789,
      "step": 4010
    },
    {
      "epoch": 0.9459379963527267,
      "grad_norm": 0.3362932503223419,
      "learning_rate": 1.6168510237702988e-05,
      "loss": 0.0841,
      "step": 4020
    },
    {
      "epoch": 0.9482910759456439,
      "grad_norm": 0.400037556886673,
      "learning_rate": 1.5462461755707223e-05,
      "loss": 0.1004,
      "step": 4030
    },
    {
      "epoch": 0.9506441555385611,
      "grad_norm": 0.6505069136619568,
      "learning_rate": 1.475641327371146e-05,
      "loss": 0.0977,
      "step": 4040
    },
    {
      "epoch": 0.9529972351314783,
      "grad_norm": 0.710784375667572,
      "learning_rate": 1.4050364791715697e-05,
      "loss": 0.085,
      "step": 4050
    },
    {
      "epoch": 0.9553503147243956,
      "grad_norm": 0.4263714849948883,
      "learning_rate": 1.3344316309719934e-05,
      "loss": 0.1044,
      "step": 4060
    },
    {
      "epoch": 0.9577033943173128,
      "grad_norm": 0.42400240898132324,
      "learning_rate": 1.2638267827724171e-05,
      "loss": 0.1113,
      "step": 4070
    },
    {
      "epoch": 0.96005647391023,
      "grad_norm": 0.2722209393978119,
      "learning_rate": 1.1932219345728404e-05,
      "loss": 0.0792,
      "step": 4080
    },
    {
      "epoch": 0.9624095535031473,
      "grad_norm": 0.9779515862464905,
      "learning_rate": 1.1226170863732641e-05,
      "loss": 0.1044,
      "step": 4090
    },
    {
      "epoch": 0.9647626330960645,
      "grad_norm": 1.028387188911438,
      "learning_rate": 1.0520122381736878e-05,
      "loss": 0.1043,
      "step": 4100
    },
    {
      "epoch": 0.9671157126889817,
      "grad_norm": 0.5009176135063171,
      "learning_rate": 9.814073899741115e-06,
      "loss": 0.087,
      "step": 4110
    },
    {
      "epoch": 0.969468792281899,
      "grad_norm": 0.33020302653312683,
      "learning_rate": 9.10802541774535e-06,
      "loss": 0.092,
      "step": 4120
    },
    {
      "epoch": 0.9718218718748162,
      "grad_norm": 0.4314991533756256,
      "learning_rate": 8.401976935749587e-06,
      "loss": 0.075,
      "step": 4130
    },
    {
      "epoch": 0.9741749514677334,
      "grad_norm": 0.6121822595596313,
      "learning_rate": 7.695928453753824e-06,
      "loss": 0.0892,
      "step": 4140
    },
    {
      "epoch": 0.9765280310606507,
      "grad_norm": 0.3374115824699402,
      "learning_rate": 6.989879971758061e-06,
      "loss": 0.0755,
      "step": 4150
    },
    {
      "epoch": 0.9788811106535679,
      "grad_norm": 0.5865825414657593,
      "learning_rate": 6.283831489762297e-06,
      "loss": 0.0957,
      "step": 4160
    },
    {
      "epoch": 0.9812341902464851,
      "grad_norm": 0.2131696194410324,
      "learning_rate": 5.577783007766533e-06,
      "loss": 0.0849,
      "step": 4170
    },
    {
      "epoch": 0.9835872698394024,
      "grad_norm": 1.3489303588867188,
      "learning_rate": 4.871734525770769e-06,
      "loss": 0.098,
      "step": 4180
    },
    {
      "epoch": 0.9859403494323196,
      "grad_norm": 0.15470068156719208,
      "learning_rate": 4.1656860437750056e-06,
      "loss": 0.0847,
      "step": 4190
    },
    {
      "epoch": 0.9882934290252368,
      "grad_norm": 0.8059414625167847,
      "learning_rate": 3.459637561779242e-06,
      "loss": 0.1094,
      "step": 4200
    },
    {
      "epoch": 0.990646508618154,
      "grad_norm": 0.6808902621269226,
      "learning_rate": 2.753589079783478e-06,
      "loss": 0.0867,
      "step": 4210
    },
    {
      "epoch": 0.9929995882110713,
      "grad_norm": 0.29802441596984863,
      "learning_rate": 2.0475405977877145e-06,
      "loss": 0.0866,
      "step": 4220
    },
    {
      "epoch": 0.9953526678039885,
      "grad_norm": 0.5227815508842468,
      "learning_rate": 1.341492115791951e-06,
      "loss": 0.0758,
      "step": 4230
    },
    {
      "epoch": 0.9977057473969057,
      "grad_norm": 0.25185248255729675,
      "learning_rate": 6.354436337961872e-07,
      "loss": 0.0906,
      "step": 4240
    },
    {
      "epoch": 0.9998235190305312,
      "step": 4249,
      "total_flos": 1.037665224400896e+16,
      "train_loss": 0.1519955188758403,
      "train_runtime": 5739.131,
      "train_samples_per_second": 11.848,
      "train_steps_per_second": 0.74
    }
  ],
  "logging_steps": 10,
  "max_steps": 4249,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "total_flos": 1.037665224400896e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}