|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.34134821811936455, |
|
"eval_steps": 500, |
|
"global_step": 1590, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0021468441391155, |
|
"grad_norm": 1.7282733917236328, |
|
"learning_rate": 0.0004989265779304422, |
|
"loss": 1.4129, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.004293688278231, |
|
"grad_norm": 2.1508498191833496, |
|
"learning_rate": 0.0004978531558608846, |
|
"loss": 1.2225, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.006440532417346501, |
|
"grad_norm": 1.6386512517929077, |
|
"learning_rate": 0.0004967797337913268, |
|
"loss": 1.1663, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.008587376556462, |
|
"grad_norm": 1.2367421388626099, |
|
"learning_rate": 0.000495706311721769, |
|
"loss": 1.1373, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.010734220695577501, |
|
"grad_norm": 1.2300989627838135, |
|
"learning_rate": 0.0004946328896522112, |
|
"loss": 1.1143, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.012881064834693002, |
|
"grad_norm": 1.1807990074157715, |
|
"learning_rate": 0.0004935594675826536, |
|
"loss": 1.0937, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.015027908973808502, |
|
"grad_norm": 0.9375188946723938, |
|
"learning_rate": 0.0004924860455130958, |
|
"loss": 1.0732, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.017174753112924, |
|
"grad_norm": 0.9801538586616516, |
|
"learning_rate": 0.000491412623443538, |
|
"loss": 1.0369, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.019321597252039503, |
|
"grad_norm": 0.9229792356491089, |
|
"learning_rate": 0.0004903392013739802, |
|
"loss": 1.0093, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.021468441391155002, |
|
"grad_norm": 1.011305570602417, |
|
"learning_rate": 0.0004892657793044225, |
|
"loss": 1.0161, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.0236152855302705, |
|
"grad_norm": 0.9356452822685242, |
|
"learning_rate": 0.00048819235723486477, |
|
"loss": 0.9939, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.025762129669386003, |
|
"grad_norm": 1.0092449188232422, |
|
"learning_rate": 0.00048711893516530704, |
|
"loss": 0.9647, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.027908973808501502, |
|
"grad_norm": 0.9663442373275757, |
|
"learning_rate": 0.0004860455130957492, |
|
"loss": 0.9595, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.030055817947617004, |
|
"grad_norm": 1.1502243280410767, |
|
"learning_rate": 0.0004849720910261915, |
|
"loss": 0.9422, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.0322026620867325, |
|
"grad_norm": 0.970102846622467, |
|
"learning_rate": 0.00048389866895663376, |
|
"loss": 0.945, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.034349506225848, |
|
"grad_norm": 1.2466392517089844, |
|
"learning_rate": 0.00048282524688707604, |
|
"loss": 0.9385, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.0364963503649635, |
|
"grad_norm": 1.0010186433792114, |
|
"learning_rate": 0.00048175182481751826, |
|
"loss": 0.9301, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.038643194504079006, |
|
"grad_norm": 1.2516905069351196, |
|
"learning_rate": 0.0004806784027479605, |
|
"loss": 0.919, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.040790038643194505, |
|
"grad_norm": 0.8497525453567505, |
|
"learning_rate": 0.00047960498067840275, |
|
"loss": 0.9054, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.042936882782310004, |
|
"grad_norm": 1.0371205806732178, |
|
"learning_rate": 0.00047853155860884503, |
|
"loss": 0.9109, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.0450837269214255, |
|
"grad_norm": 1.3313541412353516, |
|
"learning_rate": 0.00047745813653928725, |
|
"loss": 0.9131, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.047230571060541, |
|
"grad_norm": 0.9448315501213074, |
|
"learning_rate": 0.0004763847144697295, |
|
"loss": 0.9014, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.04937741519965651, |
|
"grad_norm": 1.274882435798645, |
|
"learning_rate": 0.00047531129240017175, |
|
"loss": 0.8786, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.051524259338772006, |
|
"grad_norm": 1.3116368055343628, |
|
"learning_rate": 0.000474237870330614, |
|
"loss": 0.9075, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.053671103477887505, |
|
"grad_norm": 0.9970440864562988, |
|
"learning_rate": 0.00047316444826105624, |
|
"loss": 0.8932, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.055817947617003004, |
|
"grad_norm": 1.698472499847412, |
|
"learning_rate": 0.0004720910261914985, |
|
"loss": 0.8838, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.0579647917561185, |
|
"grad_norm": 1.0129982233047485, |
|
"learning_rate": 0.0004710176041219408, |
|
"loss": 0.8779, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.06011163589523401, |
|
"grad_norm": 1.0594947338104248, |
|
"learning_rate": 0.00046994418205238296, |
|
"loss": 0.8631, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.06225848003434951, |
|
"grad_norm": 0.7768178582191467, |
|
"learning_rate": 0.00046887075998282524, |
|
"loss": 0.8666, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.064405324173465, |
|
"grad_norm": 0.9108049869537354, |
|
"learning_rate": 0.0004677973379132675, |
|
"loss": 0.8676, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.06655216831258051, |
|
"grad_norm": 1.4127992391586304, |
|
"learning_rate": 0.0004667239158437098, |
|
"loss": 0.8951, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.068699012451696, |
|
"grad_norm": 1.1507939100265503, |
|
"learning_rate": 0.000465650493774152, |
|
"loss": 0.863, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.07084585659081151, |
|
"grad_norm": 1.1579265594482422, |
|
"learning_rate": 0.00046457707170459423, |
|
"loss": 0.8716, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.072992700729927, |
|
"grad_norm": 0.9873006343841553, |
|
"learning_rate": 0.0004635036496350365, |
|
"loss": 0.8569, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.07513954486904251, |
|
"grad_norm": 1.1990203857421875, |
|
"learning_rate": 0.0004624302275654788, |
|
"loss": 0.8776, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.07728638900815801, |
|
"grad_norm": 1.1173065900802612, |
|
"learning_rate": 0.000461356805495921, |
|
"loss": 0.865, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.0794332331472735, |
|
"grad_norm": 1.2493510246276855, |
|
"learning_rate": 0.0004602833834263633, |
|
"loss": 0.8609, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.08158007728638901, |
|
"grad_norm": 1.1254737377166748, |
|
"learning_rate": 0.0004592099613568055, |
|
"loss": 0.8697, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.0837269214255045, |
|
"grad_norm": 1.1009331941604614, |
|
"learning_rate": 0.0004581365392872477, |
|
"loss": 0.8653, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.08587376556462001, |
|
"grad_norm": 1.3970990180969238, |
|
"learning_rate": 0.00045706311721769, |
|
"loss": 0.8542, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.08802060970373551, |
|
"grad_norm": 1.278136968612671, |
|
"learning_rate": 0.00045598969514813227, |
|
"loss": 0.8485, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.090167453842851, |
|
"grad_norm": 1.3295845985412598, |
|
"learning_rate": 0.00045491627307857454, |
|
"loss": 0.8501, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.09231429798196651, |
|
"grad_norm": 1.310677170753479, |
|
"learning_rate": 0.0004538428510090167, |
|
"loss": 0.849, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.094461142121082, |
|
"grad_norm": 1.0189110040664673, |
|
"learning_rate": 0.000452769428939459, |
|
"loss": 0.8566, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.09660798626019751, |
|
"grad_norm": 1.2950178384780884, |
|
"learning_rate": 0.00045169600686990126, |
|
"loss": 0.8369, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.09875483039931301, |
|
"grad_norm": 0.8336394429206848, |
|
"learning_rate": 0.00045062258480034354, |
|
"loss": 0.853, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.1009016745384285, |
|
"grad_norm": 1.1623280048370361, |
|
"learning_rate": 0.00044954916273078576, |
|
"loss": 0.8437, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.10304851867754401, |
|
"grad_norm": 1.5341142416000366, |
|
"learning_rate": 0.000448475740661228, |
|
"loss": 0.8388, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.1051953628166595, |
|
"grad_norm": 1.154572606086731, |
|
"learning_rate": 0.00044740231859167025, |
|
"loss": 0.8499, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.10734220695577501, |
|
"grad_norm": 1.291874885559082, |
|
"learning_rate": 0.00044632889652211253, |
|
"loss": 0.8508, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.10948905109489052, |
|
"grad_norm": 2.017030954360962, |
|
"learning_rate": 0.00044525547445255475, |
|
"loss": 0.8163, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.11163589523400601, |
|
"grad_norm": 1.2181349992752075, |
|
"learning_rate": 0.000444182052382997, |
|
"loss": 0.8304, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.11378273937312151, |
|
"grad_norm": 1.1240856647491455, |
|
"learning_rate": 0.00044310863031343925, |
|
"loss": 0.8339, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.115929583512237, |
|
"grad_norm": 1.5953660011291504, |
|
"learning_rate": 0.00044203520824388147, |
|
"loss": 0.8416, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.11807642765135251, |
|
"grad_norm": 0.9097370505332947, |
|
"learning_rate": 0.00044096178617432374, |
|
"loss": 0.8362, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.12022327179046802, |
|
"grad_norm": 1.0670212507247925, |
|
"learning_rate": 0.000439888364104766, |
|
"loss": 0.8395, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.12237011592958351, |
|
"grad_norm": 1.1179403066635132, |
|
"learning_rate": 0.0004388149420352083, |
|
"loss": 0.8477, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.12451696006869901, |
|
"grad_norm": 1.218599557876587, |
|
"learning_rate": 0.00043774151996565046, |
|
"loss": 0.8295, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.1266638042078145, |
|
"grad_norm": 0.9557531476020813, |
|
"learning_rate": 0.00043666809789609274, |
|
"loss": 0.8257, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.12881064834693, |
|
"grad_norm": 0.8345034122467041, |
|
"learning_rate": 0.000435594675826535, |
|
"loss": 0.8607, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.13095749248604552, |
|
"grad_norm": 0.9946607947349548, |
|
"learning_rate": 0.0004345212537569773, |
|
"loss": 0.8173, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.13310433662516102, |
|
"grad_norm": 1.3076237440109253, |
|
"learning_rate": 0.0004334478316874195, |
|
"loss": 0.8293, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.1352511807642765, |
|
"grad_norm": 1.6002768278121948, |
|
"learning_rate": 0.00043237440961786173, |
|
"loss": 0.8328, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.137398024903392, |
|
"grad_norm": 1.03147554397583, |
|
"learning_rate": 0.000431300987548304, |
|
"loss": 0.8297, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.1395448690425075, |
|
"grad_norm": 1.42938232421875, |
|
"learning_rate": 0.0004302275654787463, |
|
"loss": 0.8328, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.14169171318162302, |
|
"grad_norm": 1.319884181022644, |
|
"learning_rate": 0.0004291541434091885, |
|
"loss": 0.8496, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.14383855732073852, |
|
"grad_norm": 1.289533019065857, |
|
"learning_rate": 0.0004280807213396308, |
|
"loss": 0.8171, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.145985401459854, |
|
"grad_norm": 1.4401450157165527, |
|
"learning_rate": 0.000427007299270073, |
|
"loss": 0.8259, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.1481322455989695, |
|
"grad_norm": 1.403343677520752, |
|
"learning_rate": 0.0004259338772005152, |
|
"loss": 0.8116, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.15027908973808501, |
|
"grad_norm": 1.0387822389602661, |
|
"learning_rate": 0.0004248604551309575, |
|
"loss": 0.8233, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.15242593387720052, |
|
"grad_norm": 1.0579140186309814, |
|
"learning_rate": 0.00042378703306139977, |
|
"loss": 0.8205, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.15457277801631603, |
|
"grad_norm": 1.7332643270492554, |
|
"learning_rate": 0.00042271361099184204, |
|
"loss": 0.845, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.1567196221554315, |
|
"grad_norm": 1.8401075601577759, |
|
"learning_rate": 0.0004216401889222842, |
|
"loss": 0.8441, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.158866466294547, |
|
"grad_norm": 1.3133872747421265, |
|
"learning_rate": 0.0004205667668527265, |
|
"loss": 0.8427, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.16101331043366252, |
|
"grad_norm": 2.1324663162231445, |
|
"learning_rate": 0.00041949334478316876, |
|
"loss": 0.8298, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.16316015457277802, |
|
"grad_norm": 1.1304748058319092, |
|
"learning_rate": 0.00041841992271361104, |
|
"loss": 0.836, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.16530699871189353, |
|
"grad_norm": 1.1530399322509766, |
|
"learning_rate": 0.00041734650064405326, |
|
"loss": 0.803, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.167453842851009, |
|
"grad_norm": 0.8117969632148743, |
|
"learning_rate": 0.0004162730785744955, |
|
"loss": 0.8177, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.1696006869901245, |
|
"grad_norm": 1.217517375946045, |
|
"learning_rate": 0.00041519965650493775, |
|
"loss": 0.8383, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.17174753112924002, |
|
"grad_norm": 1.2580839395523071, |
|
"learning_rate": 0.00041412623443538, |
|
"loss": 0.8257, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.17389437526835552, |
|
"grad_norm": 1.7408099174499512, |
|
"learning_rate": 0.00041305281236582225, |
|
"loss": 0.8201, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.17604121940747103, |
|
"grad_norm": 1.1754316091537476, |
|
"learning_rate": 0.0004119793902962645, |
|
"loss": 0.8094, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.1781880635465865, |
|
"grad_norm": 1.5301543474197388, |
|
"learning_rate": 0.00041090596822670675, |
|
"loss": 0.8112, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.180334907685702, |
|
"grad_norm": 0.8299456834793091, |
|
"learning_rate": 0.00040983254615714897, |
|
"loss": 0.8518, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.18248175182481752, |
|
"grad_norm": 1.3171818256378174, |
|
"learning_rate": 0.00040875912408759124, |
|
"loss": 0.8292, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.18462859596393302, |
|
"grad_norm": 1.4290481805801392, |
|
"learning_rate": 0.0004076857020180335, |
|
"loss": 0.8147, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.18677544010304853, |
|
"grad_norm": 0.9816901683807373, |
|
"learning_rate": 0.0004066122799484758, |
|
"loss": 0.825, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.188922284242164, |
|
"grad_norm": 0.8896159529685974, |
|
"learning_rate": 0.00040553885787891796, |
|
"loss": 0.8245, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.1910691283812795, |
|
"grad_norm": 1.5641008615493774, |
|
"learning_rate": 0.00040446543580936024, |
|
"loss": 0.8204, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.19321597252039502, |
|
"grad_norm": 1.174325704574585, |
|
"learning_rate": 0.0004033920137398025, |
|
"loss": 0.8046, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.19536281665951052, |
|
"grad_norm": 1.0568900108337402, |
|
"learning_rate": 0.0004023185916702448, |
|
"loss": 0.835, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.19750966079862603, |
|
"grad_norm": 1.4573074579238892, |
|
"learning_rate": 0.000401245169600687, |
|
"loss": 0.8151, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.1996565049377415, |
|
"grad_norm": 1.7658246755599976, |
|
"learning_rate": 0.00040017174753112923, |
|
"loss": 0.8012, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.201803349076857, |
|
"grad_norm": 1.3144532442092896, |
|
"learning_rate": 0.0003990983254615715, |
|
"loss": 0.8155, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.20395019321597252, |
|
"grad_norm": 1.302480697631836, |
|
"learning_rate": 0.0003980249033920137, |
|
"loss": 0.8125, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.20609703735508803, |
|
"grad_norm": 1.6297829151153564, |
|
"learning_rate": 0.000396951481322456, |
|
"loss": 0.8157, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.20824388149420353, |
|
"grad_norm": 1.2462539672851562, |
|
"learning_rate": 0.0003958780592528983, |
|
"loss": 0.8135, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.210390725633319, |
|
"grad_norm": 1.3543071746826172, |
|
"learning_rate": 0.0003948046371833405, |
|
"loss": 0.8154, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.21253756977243451, |
|
"grad_norm": 1.5854978561401367, |
|
"learning_rate": 0.0003937312151137827, |
|
"loss": 0.7982, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.21468441391155002, |
|
"grad_norm": 1.0589042901992798, |
|
"learning_rate": 0.000392657793044225, |
|
"loss": 0.8267, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.21683125805066553, |
|
"grad_norm": 1.226970911026001, |
|
"learning_rate": 0.00039158437097466727, |
|
"loss": 0.8055, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.21897810218978103, |
|
"grad_norm": 1.390030860900879, |
|
"learning_rate": 0.00039051094890510954, |
|
"loss": 0.8272, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.2211249463288965, |
|
"grad_norm": 1.102220892906189, |
|
"learning_rate": 0.0003894375268355517, |
|
"loss": 0.8246, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.22327179046801202, |
|
"grad_norm": 1.094040870666504, |
|
"learning_rate": 0.000388364104765994, |
|
"loss": 0.814, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.22541863460712752, |
|
"grad_norm": 1.4209458827972412, |
|
"learning_rate": 0.00038729068269643626, |
|
"loss": 0.7972, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.22756547874624303, |
|
"grad_norm": 1.3925952911376953, |
|
"learning_rate": 0.0003862172606268785, |
|
"loss": 0.809, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.22971232288535853, |
|
"grad_norm": 1.0035127401351929, |
|
"learning_rate": 0.00038514383855732076, |
|
"loss": 0.8012, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.231859167024474, |
|
"grad_norm": 1.0175857543945312, |
|
"learning_rate": 0.000384070416487763, |
|
"loss": 0.7916, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.23400601116358952, |
|
"grad_norm": 1.3213493824005127, |
|
"learning_rate": 0.00038299699441820525, |
|
"loss": 0.8084, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.23615285530270502, |
|
"grad_norm": 1.4422920942306519, |
|
"learning_rate": 0.0003819235723486475, |
|
"loss": 0.8135, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.23829969944182053, |
|
"grad_norm": 1.228966474533081, |
|
"learning_rate": 0.00038085015027908975, |
|
"loss": 0.8221, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.24044654358093603, |
|
"grad_norm": 1.5089335441589355, |
|
"learning_rate": 0.000379776728209532, |
|
"loss": 0.8183, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.2425933877200515, |
|
"grad_norm": 1.2208846807479858, |
|
"learning_rate": 0.00037870330613997425, |
|
"loss": 0.7888, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.24474023185916702, |
|
"grad_norm": 1.057085633277893, |
|
"learning_rate": 0.00037762988407041647, |
|
"loss": 0.8064, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.24688707599828252, |
|
"grad_norm": 1.746360421180725, |
|
"learning_rate": 0.00037655646200085874, |
|
"loss": 0.8209, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.24903392013739803, |
|
"grad_norm": 1.4103171825408936, |
|
"learning_rate": 0.000375483039931301, |
|
"loss": 0.8161, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.25118076427651354, |
|
"grad_norm": 1.0949628353118896, |
|
"learning_rate": 0.0003744096178617433, |
|
"loss": 0.7999, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.253327608415629, |
|
"grad_norm": 1.1674295663833618, |
|
"learning_rate": 0.00037333619579218546, |
|
"loss": 0.7999, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.25547445255474455, |
|
"grad_norm": 1.729760766029358, |
|
"learning_rate": 0.00037226277372262774, |
|
"loss": 0.8091, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.25762129669386, |
|
"grad_norm": 1.3376595973968506, |
|
"learning_rate": 0.00037118935165307, |
|
"loss": 0.7909, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.2597681408329755, |
|
"grad_norm": 2.1753225326538086, |
|
"learning_rate": 0.00037011592958351223, |
|
"loss": 0.7844, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.26191498497209104, |
|
"grad_norm": 1.7476351261138916, |
|
"learning_rate": 0.0003690425075139545, |
|
"loss": 0.7972, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.2640618291112065, |
|
"grad_norm": 1.241102933883667, |
|
"learning_rate": 0.00036796908544439673, |
|
"loss": 0.8046, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.26620867325032205, |
|
"grad_norm": 1.7534103393554688, |
|
"learning_rate": 0.000366895663374839, |
|
"loss": 0.7938, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.2683555173894375, |
|
"grad_norm": 1.2782504558563232, |
|
"learning_rate": 0.0003658222413052812, |
|
"loss": 0.7891, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.270502361528553, |
|
"grad_norm": 1.1518951654434204, |
|
"learning_rate": 0.0003647488192357235, |
|
"loss": 0.7995, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.27264920566766854, |
|
"grad_norm": 1.1520744562149048, |
|
"learning_rate": 0.0003636753971661658, |
|
"loss": 0.7934, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.274796049806784, |
|
"grad_norm": 1.4017630815505981, |
|
"learning_rate": 0.000362601975096608, |
|
"loss": 0.8272, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.27694289394589955, |
|
"grad_norm": 1.7796710729599, |
|
"learning_rate": 0.0003615285530270502, |
|
"loss": 0.7782, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.279089738085015, |
|
"grad_norm": 1.5225216150283813, |
|
"learning_rate": 0.0003604551309574925, |
|
"loss": 0.7978, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.2812365822241305, |
|
"grad_norm": 1.0838427543640137, |
|
"learning_rate": 0.00035938170888793477, |
|
"loss": 0.8185, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.28338342636324604, |
|
"grad_norm": 1.5116959810256958, |
|
"learning_rate": 0.000358308286818377, |
|
"loss": 0.7929, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.2855302705023615, |
|
"grad_norm": 1.2074556350708008, |
|
"learning_rate": 0.0003572348647488192, |
|
"loss": 0.804, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.28767711464147705, |
|
"grad_norm": 1.004355788230896, |
|
"learning_rate": 0.0003561614426792615, |
|
"loss": 0.813, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.2898239587805925, |
|
"grad_norm": 1.4230481386184692, |
|
"learning_rate": 0.00035508802060970376, |
|
"loss": 0.7831, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.291970802919708, |
|
"grad_norm": 1.1971302032470703, |
|
"learning_rate": 0.000354014598540146, |
|
"loss": 0.7673, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.29411764705882354, |
|
"grad_norm": 1.3551030158996582, |
|
"learning_rate": 0.00035294117647058826, |
|
"loss": 0.7757, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.296264491197939, |
|
"grad_norm": 1.0632190704345703, |
|
"learning_rate": 0.0003518677544010305, |
|
"loss": 0.7824, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.29841133533705455, |
|
"grad_norm": 1.5460542440414429, |
|
"learning_rate": 0.00035079433233147275, |
|
"loss": 0.7871, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.30055817947617003, |
|
"grad_norm": 1.8900117874145508, |
|
"learning_rate": 0.000349720910261915, |
|
"loss": 0.7967, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.3027050236152855, |
|
"grad_norm": 1.5339765548706055, |
|
"learning_rate": 0.00034864748819235725, |
|
"loss": 0.7759, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.30485186775440104, |
|
"grad_norm": 1.721113681793213, |
|
"learning_rate": 0.0003475740661227995, |
|
"loss": 0.7792, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.3069987118935165, |
|
"grad_norm": 1.0442615747451782, |
|
"learning_rate": 0.0003465006440532417, |
|
"loss": 0.7734, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.30914555603263205, |
|
"grad_norm": 1.4723149538040161, |
|
"learning_rate": 0.00034542722198368397, |
|
"loss": 0.7839, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.31129240017174753, |
|
"grad_norm": 1.4786028861999512, |
|
"learning_rate": 0.00034435379991412624, |
|
"loss": 0.7995, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.313439244310863, |
|
"grad_norm": 1.392654538154602, |
|
"learning_rate": 0.0003432803778445685, |
|
"loss": 0.8046, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.31558608844997854, |
|
"grad_norm": 1.730966567993164, |
|
"learning_rate": 0.00034220695577501074, |
|
"loss": 0.7909, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.317732932589094, |
|
"grad_norm": 1.365211844444275, |
|
"learning_rate": 0.00034113353370545296, |
|
"loss": 0.7881, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.31987977672820955, |
|
"grad_norm": 1.2406139373779297, |
|
"learning_rate": 0.00034006011163589524, |
|
"loss": 0.8095, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.32202662086732503, |
|
"grad_norm": 2.0166332721710205, |
|
"learning_rate": 0.0003389866895663375, |
|
"loss": 0.7694, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.3241734650064405, |
|
"grad_norm": 1.9494292736053467, |
|
"learning_rate": 0.00033791326749677973, |
|
"loss": 0.8033, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.32632030914555604, |
|
"grad_norm": 1.6693007946014404, |
|
"learning_rate": 0.000336839845427222, |
|
"loss": 0.8158, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.3284671532846715, |
|
"grad_norm": 1.595958948135376, |
|
"learning_rate": 0.00033576642335766423, |
|
"loss": 0.7974, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.33061399742378705, |
|
"grad_norm": 1.8875946998596191, |
|
"learning_rate": 0.0003346930012881065, |
|
"loss": 0.7835, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.33276084156290253, |
|
"grad_norm": 1.5482693910598755, |
|
"learning_rate": 0.0003336195792185487, |
|
"loss": 0.7866, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.334907685702018, |
|
"grad_norm": 1.1274839639663696, |
|
"learning_rate": 0.000332546157148991, |
|
"loss": 0.7964, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.33705452984113354, |
|
"grad_norm": 1.5397554636001587, |
|
"learning_rate": 0.0003314727350794333, |
|
"loss": 0.7802, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.339201373980249, |
|
"grad_norm": 1.2875391244888306, |
|
"learning_rate": 0.00033039931300987544, |
|
"loss": 0.7764, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.34134821811936455, |
|
"grad_norm": 1.0845388174057007, |
|
"learning_rate": 0.0003293258909403177, |
|
"loss": 0.7839, |
|
"step": 1590 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 4658, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 30, |
|
"total_flos": 1.2016090886217754e+17, |
|
"train_batch_size": 3, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|