|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.0, |
|
"eval_steps": 500, |
|
"global_step": 5592, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.00894134477825465, |
|
"grad_norm": 0.5095388845429333, |
|
"learning_rate": 4.4642857142857145e-08, |
|
"loss": 1.8582, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.0178826895565093, |
|
"grad_norm": 0.41826013911071735, |
|
"learning_rate": 8.928571428571429e-08, |
|
"loss": 1.8696, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.02682403433476395, |
|
"grad_norm": 0.5293401027825467, |
|
"learning_rate": 1.3392857142857142e-07, |
|
"loss": 1.863, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.0357653791130186, |
|
"grad_norm": 0.5080324323077997, |
|
"learning_rate": 1.7857142857142858e-07, |
|
"loss": 1.8639, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.044706723891273246, |
|
"grad_norm": 0.5015276110428923, |
|
"learning_rate": 2.232142857142857e-07, |
|
"loss": 1.8466, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.0536480686695279, |
|
"grad_norm": 0.5192534238774392, |
|
"learning_rate": 2.6785714285714284e-07, |
|
"loss": 1.8646, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.06258941344778254, |
|
"grad_norm": 0.5036209286188108, |
|
"learning_rate": 3.1249999999999997e-07, |
|
"loss": 1.8602, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.0715307582260372, |
|
"grad_norm": 0.4912106467940169, |
|
"learning_rate": 3.5714285714285716e-07, |
|
"loss": 1.8529, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.08047210300429185, |
|
"grad_norm": 0.9830108153719349, |
|
"learning_rate": 4.017857142857143e-07, |
|
"loss": 1.8574, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.08941344778254649, |
|
"grad_norm": 0.5138072684249293, |
|
"learning_rate": 4.464285714285714e-07, |
|
"loss": 1.8498, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.09835479256080114, |
|
"grad_norm": 0.6040471717328045, |
|
"learning_rate": 4.910714285714285e-07, |
|
"loss": 1.853, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.1072961373390558, |
|
"grad_norm": 0.6457522376165349, |
|
"learning_rate": 5.357142857142857e-07, |
|
"loss": 1.849, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.11623748211731044, |
|
"grad_norm": 0.6334455155915341, |
|
"learning_rate": 5.803571428571429e-07, |
|
"loss": 1.8404, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.1251788268955651, |
|
"grad_norm": 0.6379008256976824, |
|
"learning_rate": 6.249999999999999e-07, |
|
"loss": 1.8591, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.13412017167381973, |
|
"grad_norm": 0.7299148775857892, |
|
"learning_rate": 6.69642857142857e-07, |
|
"loss": 1.8275, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.1430615164520744, |
|
"grad_norm": 0.7276311175701822, |
|
"learning_rate": 7.142857142857143e-07, |
|
"loss": 1.7953, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.15200286123032905, |
|
"grad_norm": 0.7388527300740464, |
|
"learning_rate": 7.589285714285714e-07, |
|
"loss": 1.7676, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.1609442060085837, |
|
"grad_norm": 0.8114217398767946, |
|
"learning_rate": 8.035714285714286e-07, |
|
"loss": 1.7693, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.16988555078683834, |
|
"grad_norm": 0.8300068303598062, |
|
"learning_rate": 8.482142857142857e-07, |
|
"loss": 1.7249, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.17882689556509299, |
|
"grad_norm": 0.7407086667105495, |
|
"learning_rate": 8.928571428571428e-07, |
|
"loss": 1.7008, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.18776824034334763, |
|
"grad_norm": 0.6185854806094319, |
|
"learning_rate": 9.374999999999999e-07, |
|
"loss": 1.6907, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.19670958512160228, |
|
"grad_norm": 0.6491552501216883, |
|
"learning_rate": 9.82142857142857e-07, |
|
"loss": 1.6586, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.20565092989985695, |
|
"grad_norm": 0.6005414444540486, |
|
"learning_rate": 9.970190779014309e-07, |
|
"loss": 1.6455, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.2145922746781116, |
|
"grad_norm": 0.584948620571865, |
|
"learning_rate": 9.920508744038154e-07, |
|
"loss": 1.6181, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.22353361945636624, |
|
"grad_norm": 0.5689579459890339, |
|
"learning_rate": 9.870826709062002e-07, |
|
"loss": 1.5861, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.23247496423462088, |
|
"grad_norm": 0.5479875081985119, |
|
"learning_rate": 9.821144674085851e-07, |
|
"loss": 1.5784, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.24141630901287553, |
|
"grad_norm": 0.5642004770422899, |
|
"learning_rate": 9.771462639109697e-07, |
|
"loss": 1.5524, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.2503576537911302, |
|
"grad_norm": 0.5928876776119925, |
|
"learning_rate": 9.721780604133544e-07, |
|
"loss": 1.5371, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.2592989985693848, |
|
"grad_norm": 0.5977759151905918, |
|
"learning_rate": 9.672098569157392e-07, |
|
"loss": 1.5132, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.26824034334763946, |
|
"grad_norm": 0.5575061860434638, |
|
"learning_rate": 9.62241653418124e-07, |
|
"loss": 1.5064, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.2771816881258941, |
|
"grad_norm": 0.5630188145960676, |
|
"learning_rate": 9.572734499205087e-07, |
|
"loss": 1.4761, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.2861230329041488, |
|
"grad_norm": 0.5611716812815478, |
|
"learning_rate": 9.523052464228934e-07, |
|
"loss": 1.4614, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.29506437768240346, |
|
"grad_norm": 0.46795354610537326, |
|
"learning_rate": 9.473370429252782e-07, |
|
"loss": 1.4298, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.3040057224606581, |
|
"grad_norm": 0.44554190069369415, |
|
"learning_rate": 9.423688394276629e-07, |
|
"loss": 1.4318, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.31294706723891275, |
|
"grad_norm": 1.3485889211129667, |
|
"learning_rate": 9.374006359300477e-07, |
|
"loss": 1.4183, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.3218884120171674, |
|
"grad_norm": 0.5688806165123188, |
|
"learning_rate": 9.324324324324324e-07, |
|
"loss": 1.3918, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.33082975679542204, |
|
"grad_norm": 0.5874567786702063, |
|
"learning_rate": 9.274642289348172e-07, |
|
"loss": 1.3886, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.3397711015736767, |
|
"grad_norm": 0.3766300486264232, |
|
"learning_rate": 9.224960254372018e-07, |
|
"loss": 1.3839, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.3487124463519313, |
|
"grad_norm": 0.39812336031911777, |
|
"learning_rate": 9.175278219395866e-07, |
|
"loss": 1.3677, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.35765379113018597, |
|
"grad_norm": 0.3843707797396072, |
|
"learning_rate": 9.125596184419714e-07, |
|
"loss": 1.358, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.3665951359084406, |
|
"grad_norm": 0.3847089855262342, |
|
"learning_rate": 9.075914149443561e-07, |
|
"loss": 1.3469, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 0.37553648068669526, |
|
"grad_norm": 0.5315974805229721, |
|
"learning_rate": 9.026232114467408e-07, |
|
"loss": 1.3377, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.3844778254649499, |
|
"grad_norm": 0.37212834114959115, |
|
"learning_rate": 8.976550079491256e-07, |
|
"loss": 1.326, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 0.39341917024320455, |
|
"grad_norm": 0.3369886595418098, |
|
"learning_rate": 8.926868044515103e-07, |
|
"loss": 1.3141, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.40236051502145925, |
|
"grad_norm": 0.3232053255299734, |
|
"learning_rate": 8.877186009538951e-07, |
|
"loss": 1.3176, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 0.4113018597997139, |
|
"grad_norm": 0.5405167043382479, |
|
"learning_rate": 8.827503974562798e-07, |
|
"loss": 1.3125, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.42024320457796854, |
|
"grad_norm": 0.28257166431534414, |
|
"learning_rate": 8.777821939586645e-07, |
|
"loss": 1.2911, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 0.4291845493562232, |
|
"grad_norm": 0.29182342370487174, |
|
"learning_rate": 8.728139904610492e-07, |
|
"loss": 1.2935, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.43812589413447783, |
|
"grad_norm": 0.22773442408616737, |
|
"learning_rate": 8.678457869634341e-07, |
|
"loss": 1.2888, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 0.4470672389127325, |
|
"grad_norm": 0.22436263710764737, |
|
"learning_rate": 8.628775834658187e-07, |
|
"loss": 1.2745, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.4560085836909871, |
|
"grad_norm": 0.21775793386766282, |
|
"learning_rate": 8.579093799682035e-07, |
|
"loss": 1.269, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 0.46494992846924177, |
|
"grad_norm": 0.22214064111096374, |
|
"learning_rate": 8.529411764705882e-07, |
|
"loss": 1.267, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.4738912732474964, |
|
"grad_norm": 0.20243266736371351, |
|
"learning_rate": 8.47972972972973e-07, |
|
"loss": 1.276, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 0.48283261802575106, |
|
"grad_norm": 0.20705292614851298, |
|
"learning_rate": 8.430047694753577e-07, |
|
"loss": 1.272, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.4917739628040057, |
|
"grad_norm": 0.19119742077379076, |
|
"learning_rate": 8.380365659777425e-07, |
|
"loss": 1.2724, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 0.5007153075822603, |
|
"grad_norm": 0.23794007928706434, |
|
"learning_rate": 8.330683624801271e-07, |
|
"loss": 1.2583, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.509656652360515, |
|
"grad_norm": 0.18340837203234114, |
|
"learning_rate": 8.281001589825118e-07, |
|
"loss": 1.2499, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 0.5185979971387696, |
|
"grad_norm": 0.193128543688102, |
|
"learning_rate": 8.231319554848967e-07, |
|
"loss": 1.2612, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.5275393419170243, |
|
"grad_norm": 0.1751575826741444, |
|
"learning_rate": 8.181637519872813e-07, |
|
"loss": 1.2502, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 0.5364806866952789, |
|
"grad_norm": 0.2364503348095419, |
|
"learning_rate": 8.131955484896661e-07, |
|
"loss": 1.243, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.5454220314735336, |
|
"grad_norm": 0.25652812638575234, |
|
"learning_rate": 8.082273449920508e-07, |
|
"loss": 1.2559, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 0.5543633762517882, |
|
"grad_norm": 0.1737798761662478, |
|
"learning_rate": 8.032591414944355e-07, |
|
"loss": 1.2529, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.5633047210300429, |
|
"grad_norm": 0.16909231999113233, |
|
"learning_rate": 7.982909379968203e-07, |
|
"loss": 1.2425, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 0.5722460658082976, |
|
"grad_norm": 0.16621585163311706, |
|
"learning_rate": 7.933227344992051e-07, |
|
"loss": 1.2428, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.5811874105865522, |
|
"grad_norm": 0.29941509287671747, |
|
"learning_rate": 7.883545310015897e-07, |
|
"loss": 1.2501, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 0.5901287553648069, |
|
"grad_norm": 0.17035369815136836, |
|
"learning_rate": 7.833863275039745e-07, |
|
"loss": 1.2515, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.5990701001430615, |
|
"grad_norm": 0.17629538126188918, |
|
"learning_rate": 7.784181240063593e-07, |
|
"loss": 1.2463, |
|
"step": 1675 |
|
}, |
|
{ |
|
"epoch": 0.6080114449213162, |
|
"grad_norm": 0.1697497467753232, |
|
"learning_rate": 7.73449920508744e-07, |
|
"loss": 1.2311, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.6169527896995708, |
|
"grad_norm": 0.15981908691911512, |
|
"learning_rate": 7.684817170111287e-07, |
|
"loss": 1.2449, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 0.6258941344778255, |
|
"grad_norm": 0.1529795356453295, |
|
"learning_rate": 7.635135135135135e-07, |
|
"loss": 1.2558, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.6348354792560801, |
|
"grad_norm": 0.1609652679207339, |
|
"learning_rate": 7.585453100158981e-07, |
|
"loss": 1.238, |
|
"step": 1775 |
|
}, |
|
{ |
|
"epoch": 0.6437768240343348, |
|
"grad_norm": 0.14621942114492062, |
|
"learning_rate": 7.53577106518283e-07, |
|
"loss": 1.2281, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.6527181688125894, |
|
"grad_norm": 0.1618220669804669, |
|
"learning_rate": 7.486089030206677e-07, |
|
"loss": 1.2362, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 0.6616595135908441, |
|
"grad_norm": 0.16201225587717186, |
|
"learning_rate": 7.436406995230524e-07, |
|
"loss": 1.2261, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.6706008583690987, |
|
"grad_norm": 0.16891334320378984, |
|
"learning_rate": 7.386724960254371e-07, |
|
"loss": 1.2233, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 0.6795422031473534, |
|
"grad_norm": 0.15836253423551233, |
|
"learning_rate": 7.33704292527822e-07, |
|
"loss": 1.2041, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.6884835479256081, |
|
"grad_norm": 0.16173299579695544, |
|
"learning_rate": 7.287360890302066e-07, |
|
"loss": 1.2112, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 0.6974248927038627, |
|
"grad_norm": 0.15311569879619952, |
|
"learning_rate": 7.237678855325914e-07, |
|
"loss": 1.2045, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.7063662374821174, |
|
"grad_norm": 0.1454174761720946, |
|
"learning_rate": 7.187996820349761e-07, |
|
"loss": 1.2142, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 0.7153075822603719, |
|
"grad_norm": 0.14358617606783605, |
|
"learning_rate": 7.138314785373608e-07, |
|
"loss": 1.2254, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.7242489270386266, |
|
"grad_norm": 0.15123594116890207, |
|
"learning_rate": 7.088632750397456e-07, |
|
"loss": 1.2043, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 0.7331902718168812, |
|
"grad_norm": 0.1484178022383261, |
|
"learning_rate": 7.038950715421304e-07, |
|
"loss": 1.2102, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.7421316165951359, |
|
"grad_norm": 0.15779752084285467, |
|
"learning_rate": 6.98926868044515e-07, |
|
"loss": 1.219, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 0.7510729613733905, |
|
"grad_norm": 0.14639571745115018, |
|
"learning_rate": 6.939586645468998e-07, |
|
"loss": 1.2053, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.7600143061516452, |
|
"grad_norm": 0.18349123256744296, |
|
"learning_rate": 6.889904610492846e-07, |
|
"loss": 1.2154, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 0.7689556509298998, |
|
"grad_norm": 0.1564150328615537, |
|
"learning_rate": 6.840222575516693e-07, |
|
"loss": 1.2004, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.7778969957081545, |
|
"grad_norm": 0.15591360624463577, |
|
"learning_rate": 6.79054054054054e-07, |
|
"loss": 1.2097, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 0.7868383404864091, |
|
"grad_norm": 0.1525230509402887, |
|
"learning_rate": 6.740858505564388e-07, |
|
"loss": 1.2009, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.7957796852646638, |
|
"grad_norm": 0.13632266793566936, |
|
"learning_rate": 6.691176470588234e-07, |
|
"loss": 1.2169, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 0.8047210300429185, |
|
"grad_norm": 0.165619157435053, |
|
"learning_rate": 6.641494435612083e-07, |
|
"loss": 1.2032, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.8136623748211731, |
|
"grad_norm": 0.1382749276518515, |
|
"learning_rate": 6.59181240063593e-07, |
|
"loss": 1.1998, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 0.8226037195994278, |
|
"grad_norm": 0.15894676488709247, |
|
"learning_rate": 6.542130365659777e-07, |
|
"loss": 1.2046, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.8315450643776824, |
|
"grad_norm": 0.14139510562282273, |
|
"learning_rate": 6.492448330683624e-07, |
|
"loss": 1.2022, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 0.8404864091559371, |
|
"grad_norm": 0.16356442362082368, |
|
"learning_rate": 6.442766295707473e-07, |
|
"loss": 1.1888, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.8494277539341917, |
|
"grad_norm": 0.13001938286669296, |
|
"learning_rate": 6.393084260731319e-07, |
|
"loss": 1.2036, |
|
"step": 2375 |
|
}, |
|
{ |
|
"epoch": 0.8583690987124464, |
|
"grad_norm": 0.1376171055331258, |
|
"learning_rate": 6.343402225755167e-07, |
|
"loss": 1.1967, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.867310443490701, |
|
"grad_norm": 0.1555553003869712, |
|
"learning_rate": 6.293720190779014e-07, |
|
"loss": 1.202, |
|
"step": 2425 |
|
}, |
|
{ |
|
"epoch": 0.8762517882689557, |
|
"grad_norm": 0.1536545724212523, |
|
"learning_rate": 6.24403815580286e-07, |
|
"loss": 1.1993, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.8851931330472103, |
|
"grad_norm": 0.15017510970577388, |
|
"learning_rate": 6.194356120826709e-07, |
|
"loss": 1.1878, |
|
"step": 2475 |
|
}, |
|
{ |
|
"epoch": 0.894134477825465, |
|
"grad_norm": 0.1501666434092056, |
|
"learning_rate": 6.144674085850557e-07, |
|
"loss": 1.1855, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.9030758226037195, |
|
"grad_norm": 0.14083948154628234, |
|
"learning_rate": 6.094992050874403e-07, |
|
"loss": 1.1858, |
|
"step": 2525 |
|
}, |
|
{ |
|
"epoch": 0.9120171673819742, |
|
"grad_norm": 0.1480314047950012, |
|
"learning_rate": 6.04531001589825e-07, |
|
"loss": 1.1718, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.920958512160229, |
|
"grad_norm": 0.14172604863631122, |
|
"learning_rate": 5.995627980922098e-07, |
|
"loss": 1.1819, |
|
"step": 2575 |
|
}, |
|
{ |
|
"epoch": 0.9298998569384835, |
|
"grad_norm": 0.12782051260136137, |
|
"learning_rate": 5.945945945945947e-07, |
|
"loss": 1.1956, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.9388412017167382, |
|
"grad_norm": 0.1451085538055058, |
|
"learning_rate": 5.896263910969793e-07, |
|
"loss": 1.1744, |
|
"step": 2625 |
|
}, |
|
{ |
|
"epoch": 0.9477825464949928, |
|
"grad_norm": 0.40345434298159216, |
|
"learning_rate": 5.84658187599364e-07, |
|
"loss": 1.1783, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.9567238912732475, |
|
"grad_norm": 0.13772381526739436, |
|
"learning_rate": 5.796899841017488e-07, |
|
"loss": 1.1839, |
|
"step": 2675 |
|
}, |
|
{ |
|
"epoch": 0.9656652360515021, |
|
"grad_norm": 0.15802899189862257, |
|
"learning_rate": 5.747217806041335e-07, |
|
"loss": 1.1792, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.9746065808297568, |
|
"grad_norm": 0.1467263131203197, |
|
"learning_rate": 5.697535771065183e-07, |
|
"loss": 1.1881, |
|
"step": 2725 |
|
}, |
|
{ |
|
"epoch": 0.9835479256080114, |
|
"grad_norm": 0.18086918496034873, |
|
"learning_rate": 5.64785373608903e-07, |
|
"loss": 1.1847, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.9924892703862661, |
|
"grad_norm": 0.13322108072315525, |
|
"learning_rate": 5.598171701112877e-07, |
|
"loss": 1.174, |
|
"step": 2775 |
|
}, |
|
{ |
|
"epoch": 1.0014306151645207, |
|
"grad_norm": 0.13758424586995027, |
|
"learning_rate": 5.548489666136724e-07, |
|
"loss": 1.1744, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 1.0103719599427754, |
|
"grad_norm": 0.14695511905283026, |
|
"learning_rate": 5.498807631160573e-07, |
|
"loss": 1.1753, |
|
"step": 2825 |
|
}, |
|
{ |
|
"epoch": 1.01931330472103, |
|
"grad_norm": 0.13583673106446567, |
|
"learning_rate": 5.449125596184419e-07, |
|
"loss": 1.177, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 1.0282546494992848, |
|
"grad_norm": 0.14863281353489802, |
|
"learning_rate": 5.399443561208267e-07, |
|
"loss": 1.1698, |
|
"step": 2875 |
|
}, |
|
{ |
|
"epoch": 1.0371959942775393, |
|
"grad_norm": 0.1719230111879567, |
|
"learning_rate": 5.349761526232114e-07, |
|
"loss": 1.17, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 1.046137339055794, |
|
"grad_norm": 0.14198220020823107, |
|
"learning_rate": 5.300079491255962e-07, |
|
"loss": 1.1599, |
|
"step": 2925 |
|
}, |
|
{ |
|
"epoch": 1.0550786838340487, |
|
"grad_norm": 0.14656880263067942, |
|
"learning_rate": 5.250397456279809e-07, |
|
"loss": 1.1613, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 1.0640200286123034, |
|
"grad_norm": 0.17984614771854038, |
|
"learning_rate": 5.200715421303657e-07, |
|
"loss": 1.1614, |
|
"step": 2975 |
|
}, |
|
{ |
|
"epoch": 1.0729613733905579, |
|
"grad_norm": 0.16125844481926355, |
|
"learning_rate": 5.151033386327503e-07, |
|
"loss": 1.1456, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.0819027181688126, |
|
"grad_norm": 0.15302132985828396, |
|
"learning_rate": 5.101351351351351e-07, |
|
"loss": 1.1582, |
|
"step": 3025 |
|
}, |
|
{ |
|
"epoch": 1.0908440629470673, |
|
"grad_norm": 0.17416017697667222, |
|
"learning_rate": 5.051669316375199e-07, |
|
"loss": 1.1645, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 1.099785407725322, |
|
"grad_norm": 0.1576441191719269, |
|
"learning_rate": 5.001987281399046e-07, |
|
"loss": 1.1614, |
|
"step": 3075 |
|
}, |
|
{ |
|
"epoch": 1.1087267525035764, |
|
"grad_norm": 0.15708931920894004, |
|
"learning_rate": 4.952305246422893e-07, |
|
"loss": 1.1589, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 1.1176680972818311, |
|
"grad_norm": 0.18088098015223084, |
|
"learning_rate": 4.902623211446741e-07, |
|
"loss": 1.1516, |
|
"step": 3125 |
|
}, |
|
{ |
|
"epoch": 1.1266094420600858, |
|
"grad_norm": 0.15266497982250174, |
|
"learning_rate": 4.852941176470588e-07, |
|
"loss": 1.1432, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 1.1355507868383405, |
|
"grad_norm": 0.16769944079369944, |
|
"learning_rate": 4.803259141494435e-07, |
|
"loss": 1.151, |
|
"step": 3175 |
|
}, |
|
{ |
|
"epoch": 1.144492131616595, |
|
"grad_norm": 0.183750642878192, |
|
"learning_rate": 4.7535771065182827e-07, |
|
"loss": 1.1629, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 1.1534334763948497, |
|
"grad_norm": 0.15008294024377883, |
|
"learning_rate": 4.70389507154213e-07, |
|
"loss": 1.1497, |
|
"step": 3225 |
|
}, |
|
{ |
|
"epoch": 1.1623748211731044, |
|
"grad_norm": 0.1792182775394022, |
|
"learning_rate": 4.6542130365659777e-07, |
|
"loss": 1.1442, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 1.1713161659513591, |
|
"grad_norm": 0.1815534124009153, |
|
"learning_rate": 4.6045310015898247e-07, |
|
"loss": 1.1424, |
|
"step": 3275 |
|
}, |
|
{ |
|
"epoch": 1.1802575107296138, |
|
"grad_norm": 0.1634530123009294, |
|
"learning_rate": 4.5548489666136727e-07, |
|
"loss": 1.1445, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 1.1891988555078683, |
|
"grad_norm": 0.15595048975318404, |
|
"learning_rate": 4.5051669316375196e-07, |
|
"loss": 1.1471, |
|
"step": 3325 |
|
}, |
|
{ |
|
"epoch": 1.198140200286123, |
|
"grad_norm": 0.15375493348738128, |
|
"learning_rate": 4.455484896661367e-07, |
|
"loss": 1.1559, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 1.2070815450643777, |
|
"grad_norm": 0.15595330109052555, |
|
"learning_rate": 4.4058028616852146e-07, |
|
"loss": 1.1454, |
|
"step": 3375 |
|
}, |
|
{ |
|
"epoch": 1.2160228898426324, |
|
"grad_norm": 0.1531828453320636, |
|
"learning_rate": 4.3561208267090616e-07, |
|
"loss": 1.1447, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 1.224964234620887, |
|
"grad_norm": 0.13496531066929612, |
|
"learning_rate": 4.306438791732909e-07, |
|
"loss": 1.1228, |
|
"step": 3425 |
|
}, |
|
{ |
|
"epoch": 1.2339055793991416, |
|
"grad_norm": 0.16876704458729294, |
|
"learning_rate": 4.2567567567567566e-07, |
|
"loss": 1.1267, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 1.2428469241773963, |
|
"grad_norm": 0.35531759007502683, |
|
"learning_rate": 4.207074721780604e-07, |
|
"loss": 1.1282, |
|
"step": 3475 |
|
}, |
|
{ |
|
"epoch": 1.251788268955651, |
|
"grad_norm": 0.14942448217724147, |
|
"learning_rate": 4.157392686804451e-07, |
|
"loss": 1.1321, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 1.2607296137339055, |
|
"grad_norm": 0.16823840198797532, |
|
"learning_rate": 4.107710651828299e-07, |
|
"loss": 1.1297, |
|
"step": 3525 |
|
}, |
|
{ |
|
"epoch": 1.2696709585121602, |
|
"grad_norm": 0.1373099361223873, |
|
"learning_rate": 4.058028616852146e-07, |
|
"loss": 1.1289, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 1.2786123032904149, |
|
"grad_norm": 0.1238133749616558, |
|
"learning_rate": 4.0083465818759935e-07, |
|
"loss": 1.1197, |
|
"step": 3575 |
|
}, |
|
{ |
|
"epoch": 1.2875536480686696, |
|
"grad_norm": 0.16232957883088006, |
|
"learning_rate": 3.958664546899841e-07, |
|
"loss": 1.1517, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 1.2964949928469243, |
|
"grad_norm": 0.15052195486822473, |
|
"learning_rate": 3.908982511923688e-07, |
|
"loss": 1.1115, |
|
"step": 3625 |
|
}, |
|
{ |
|
"epoch": 1.3054363376251787, |
|
"grad_norm": 0.1464299159190494, |
|
"learning_rate": 3.8593004769475355e-07, |
|
"loss": 1.121, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 1.3143776824034334, |
|
"grad_norm": 0.16556774569358795, |
|
"learning_rate": 3.809618441971383e-07, |
|
"loss": 1.1248, |
|
"step": 3675 |
|
}, |
|
{ |
|
"epoch": 1.3233190271816881, |
|
"grad_norm": 0.14677494246881193, |
|
"learning_rate": 3.7599364069952305e-07, |
|
"loss": 1.1301, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 1.3322603719599428, |
|
"grad_norm": 0.1385988595636303, |
|
"learning_rate": 3.7102543720190775e-07, |
|
"loss": 1.1217, |
|
"step": 3725 |
|
}, |
|
{ |
|
"epoch": 1.3412017167381975, |
|
"grad_norm": 0.132770703438204, |
|
"learning_rate": 3.6605723370429255e-07, |
|
"loss": 1.128, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 1.350143061516452, |
|
"grad_norm": 0.14471226652721184, |
|
"learning_rate": 3.6108903020667724e-07, |
|
"loss": 1.1258, |
|
"step": 3775 |
|
}, |
|
{ |
|
"epoch": 1.3590844062947067, |
|
"grad_norm": 0.15829087639433673, |
|
"learning_rate": 3.5612082670906194e-07, |
|
"loss": 1.1232, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 1.3680257510729614, |
|
"grad_norm": 0.14881510104209672, |
|
"learning_rate": 3.5115262321144674e-07, |
|
"loss": 1.1326, |
|
"step": 3825 |
|
}, |
|
{ |
|
"epoch": 1.376967095851216, |
|
"grad_norm": 0.14588466142604242, |
|
"learning_rate": 3.4618441971383144e-07, |
|
"loss": 1.1164, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 1.3859084406294706, |
|
"grad_norm": 0.13477416021173907, |
|
"learning_rate": 3.412162162162162e-07, |
|
"loss": 1.1285, |
|
"step": 3875 |
|
}, |
|
{ |
|
"epoch": 1.3948497854077253, |
|
"grad_norm": 0.21722908586608208, |
|
"learning_rate": 3.3624801271860094e-07, |
|
"loss": 1.1201, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 1.40379113018598, |
|
"grad_norm": 0.13558184653823538, |
|
"learning_rate": 3.312798092209857e-07, |
|
"loss": 1.1156, |
|
"step": 3925 |
|
}, |
|
{ |
|
"epoch": 1.4127324749642347, |
|
"grad_norm": 0.14185323455679683, |
|
"learning_rate": 3.263116057233704e-07, |
|
"loss": 1.1177, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 1.4216738197424892, |
|
"grad_norm": 0.14569471186135233, |
|
"learning_rate": 3.213434022257552e-07, |
|
"loss": 1.117, |
|
"step": 3975 |
|
}, |
|
{ |
|
"epoch": 1.4306151645207439, |
|
"grad_norm": 0.1303851673327878, |
|
"learning_rate": 3.163751987281399e-07, |
|
"loss": 1.1194, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 1.4395565092989986, |
|
"grad_norm": 0.13823108648889798, |
|
"learning_rate": 3.114069952305246e-07, |
|
"loss": 1.1153, |
|
"step": 4025 |
|
}, |
|
{ |
|
"epoch": 1.4484978540772533, |
|
"grad_norm": 0.14544212946759183, |
|
"learning_rate": 3.064387917329094e-07, |
|
"loss": 1.1043, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 1.457439198855508, |
|
"grad_norm": 0.1290933120501116, |
|
"learning_rate": 3.014705882352941e-07, |
|
"loss": 1.1213, |
|
"step": 4075 |
|
}, |
|
{ |
|
"epoch": 1.4663805436337625, |
|
"grad_norm": 0.12809510682493896, |
|
"learning_rate": 2.965023847376789e-07, |
|
"loss": 1.113, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 1.4753218884120172, |
|
"grad_norm": 0.1419793306501672, |
|
"learning_rate": 2.915341812400636e-07, |
|
"loss": 1.1309, |
|
"step": 4125 |
|
}, |
|
{ |
|
"epoch": 1.4842632331902719, |
|
"grad_norm": 0.13406734402981146, |
|
"learning_rate": 2.8656597774244833e-07, |
|
"loss": 1.1256, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 1.4932045779685263, |
|
"grad_norm": 0.13587793689155384, |
|
"learning_rate": 2.815977742448331e-07, |
|
"loss": 1.1187, |
|
"step": 4175 |
|
}, |
|
{ |
|
"epoch": 1.5021459227467813, |
|
"grad_norm": 0.13258607901887373, |
|
"learning_rate": 2.766295707472178e-07, |
|
"loss": 1.1228, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 1.5110872675250357, |
|
"grad_norm": 0.1247874755702546, |
|
"learning_rate": 2.716613672496025e-07, |
|
"loss": 1.1164, |
|
"step": 4225 |
|
}, |
|
{ |
|
"epoch": 1.5200286123032904, |
|
"grad_norm": 0.12239855788322539, |
|
"learning_rate": 2.666931637519873e-07, |
|
"loss": 1.1117, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 1.5289699570815452, |
|
"grad_norm": 0.1369183999332412, |
|
"learning_rate": 2.61724960254372e-07, |
|
"loss": 1.1172, |
|
"step": 4275 |
|
}, |
|
{ |
|
"epoch": 1.5379113018597996, |
|
"grad_norm": 1.1332621790402788, |
|
"learning_rate": 2.567567567567567e-07, |
|
"loss": 1.0993, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 1.5468526466380543, |
|
"grad_norm": 0.1253338454363431, |
|
"learning_rate": 2.517885532591415e-07, |
|
"loss": 1.1165, |
|
"step": 4325 |
|
}, |
|
{ |
|
"epoch": 1.555793991416309, |
|
"grad_norm": 0.12665869326621307, |
|
"learning_rate": 2.468203497615262e-07, |
|
"loss": 1.1075, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 1.5647353361945635, |
|
"grad_norm": 0.12307463927377256, |
|
"learning_rate": 2.4185214626391097e-07, |
|
"loss": 1.1091, |
|
"step": 4375 |
|
}, |
|
{ |
|
"epoch": 1.5736766809728184, |
|
"grad_norm": 0.13556347702963126, |
|
"learning_rate": 2.368839427662957e-07, |
|
"loss": 1.1106, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 1.582618025751073, |
|
"grad_norm": 0.1437851683344807, |
|
"learning_rate": 2.3191573926868044e-07, |
|
"loss": 1.1258, |
|
"step": 4425 |
|
}, |
|
{ |
|
"epoch": 1.5915593705293276, |
|
"grad_norm": 0.1483471734540286, |
|
"learning_rate": 2.269475357710652e-07, |
|
"loss": 1.1185, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 1.6005007153075823, |
|
"grad_norm": 0.1417713425850489, |
|
"learning_rate": 2.2197933227344991e-07, |
|
"loss": 1.1091, |
|
"step": 4475 |
|
}, |
|
{ |
|
"epoch": 1.6094420600858368, |
|
"grad_norm": 0.11258187644158707, |
|
"learning_rate": 2.1701112877583466e-07, |
|
"loss": 1.1141, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 1.6183834048640917, |
|
"grad_norm": 1.4844013392798963, |
|
"learning_rate": 2.120429252782194e-07, |
|
"loss": 1.1083, |
|
"step": 4525 |
|
}, |
|
{ |
|
"epoch": 1.6273247496423462, |
|
"grad_norm": 0.14485866801127223, |
|
"learning_rate": 2.070747217806041e-07, |
|
"loss": 1.1128, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 1.636266094420601, |
|
"grad_norm": 0.1310091219191068, |
|
"learning_rate": 2.0210651828298886e-07, |
|
"loss": 1.1081, |
|
"step": 4575 |
|
}, |
|
{ |
|
"epoch": 1.6452074391988556, |
|
"grad_norm": 0.1324864798139755, |
|
"learning_rate": 1.971383147853736e-07, |
|
"loss": 1.1218, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 1.65414878397711, |
|
"grad_norm": 0.16408287715823638, |
|
"learning_rate": 1.9217011128775833e-07, |
|
"loss": 1.1128, |
|
"step": 4625 |
|
}, |
|
{ |
|
"epoch": 1.6630901287553648, |
|
"grad_norm": 0.13453997589781927, |
|
"learning_rate": 1.8720190779014308e-07, |
|
"loss": 1.109, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 1.6720314735336195, |
|
"grad_norm": 0.11431592419320451, |
|
"learning_rate": 1.8223370429252783e-07, |
|
"loss": 1.1128, |
|
"step": 4675 |
|
}, |
|
{ |
|
"epoch": 1.680972818311874, |
|
"grad_norm": 0.12937079012352717, |
|
"learning_rate": 1.7726550079491255e-07, |
|
"loss": 1.107, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 1.6899141630901289, |
|
"grad_norm": 0.13493617023722984, |
|
"learning_rate": 1.7229729729729728e-07, |
|
"loss": 1.1026, |
|
"step": 4725 |
|
}, |
|
{ |
|
"epoch": 1.6988555078683834, |
|
"grad_norm": 0.12427164940365912, |
|
"learning_rate": 1.6732909379968203e-07, |
|
"loss": 1.1002, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 1.707796852646638, |
|
"grad_norm": 0.12582353400466703, |
|
"learning_rate": 1.6236089030206675e-07, |
|
"loss": 1.1091, |
|
"step": 4775 |
|
}, |
|
{ |
|
"epoch": 1.7167381974248928, |
|
"grad_norm": 0.1209250119003803, |
|
"learning_rate": 1.573926868044515e-07, |
|
"loss": 1.0988, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 1.7256795422031472, |
|
"grad_norm": 0.1416217314930489, |
|
"learning_rate": 1.5242448330683625e-07, |
|
"loss": 1.1083, |
|
"step": 4825 |
|
}, |
|
{ |
|
"epoch": 1.7346208869814022, |
|
"grad_norm": 0.13057929166120621, |
|
"learning_rate": 1.4745627980922097e-07, |
|
"loss": 1.1164, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 1.7435622317596566, |
|
"grad_norm": 0.13275668946657931, |
|
"learning_rate": 1.4248807631160572e-07, |
|
"loss": 1.1164, |
|
"step": 4875 |
|
}, |
|
{ |
|
"epoch": 1.7525035765379113, |
|
"grad_norm": 0.19255405164097672, |
|
"learning_rate": 1.3751987281399047e-07, |
|
"loss": 1.0945, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 1.761444921316166, |
|
"grad_norm": 0.13958161437492914, |
|
"learning_rate": 1.3255166931637517e-07, |
|
"loss": 1.1094, |
|
"step": 4925 |
|
}, |
|
{ |
|
"epoch": 1.7703862660944205, |
|
"grad_norm": 0.1379908503816713, |
|
"learning_rate": 1.2758346581875992e-07, |
|
"loss": 1.1027, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 1.7793276108726752, |
|
"grad_norm": 0.1146403989881727, |
|
"learning_rate": 1.2261526232114467e-07, |
|
"loss": 1.0966, |
|
"step": 4975 |
|
}, |
|
{ |
|
"epoch": 1.78826895565093, |
|
"grad_norm": 0.1296359557728573, |
|
"learning_rate": 1.176470588235294e-07, |
|
"loss": 1.1163, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 1.7972103004291844, |
|
"grad_norm": 0.1242709050610396, |
|
"learning_rate": 1.1267885532591414e-07, |
|
"loss": 1.109, |
|
"step": 5025 |
|
}, |
|
{ |
|
"epoch": 1.8061516452074393, |
|
"grad_norm": 0.12401932092728496, |
|
"learning_rate": 1.0771065182829889e-07, |
|
"loss": 1.0986, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 1.8150929899856938, |
|
"grad_norm": 0.1268226445976438, |
|
"learning_rate": 1.0274244833068361e-07, |
|
"loss": 1.1023, |
|
"step": 5075 |
|
}, |
|
{ |
|
"epoch": 1.8240343347639485, |
|
"grad_norm": 0.12184905639120043, |
|
"learning_rate": 9.777424483306836e-08, |
|
"loss": 1.0985, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 1.8329756795422032, |
|
"grad_norm": 0.12734274404323123, |
|
"learning_rate": 9.28060413354531e-08, |
|
"loss": 1.1154, |
|
"step": 5125 |
|
}, |
|
{ |
|
"epoch": 1.8419170243204577, |
|
"grad_norm": 0.13235924797466722, |
|
"learning_rate": 8.783783783783784e-08, |
|
"loss": 1.098, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 1.8508583690987126, |
|
"grad_norm": 0.11628761020569767, |
|
"learning_rate": 8.286963434022257e-08, |
|
"loss": 1.106, |
|
"step": 5175 |
|
}, |
|
{ |
|
"epoch": 1.859799713876967, |
|
"grad_norm": 0.19214819884604664, |
|
"learning_rate": 7.790143084260731e-08, |
|
"loss": 1.0889, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 1.8687410586552218, |
|
"grad_norm": 0.487598636016336, |
|
"learning_rate": 7.293322734499204e-08, |
|
"loss": 1.1045, |
|
"step": 5225 |
|
}, |
|
{ |
|
"epoch": 1.8776824034334765, |
|
"grad_norm": 0.13127933734364353, |
|
"learning_rate": 6.79650238473768e-08, |
|
"loss": 1.1017, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 1.886623748211731, |
|
"grad_norm": 0.1314482800625486, |
|
"learning_rate": 6.299682034976152e-08, |
|
"loss": 1.1044, |
|
"step": 5275 |
|
}, |
|
{ |
|
"epoch": 1.8955650929899857, |
|
"grad_norm": 0.12676434160993588, |
|
"learning_rate": 5.802861685214626e-08, |
|
"loss": 1.1043, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 1.9045064377682404, |
|
"grad_norm": 0.1323305276547178, |
|
"learning_rate": 5.3060413354531e-08, |
|
"loss": 1.1026, |
|
"step": 5325 |
|
}, |
|
{ |
|
"epoch": 1.9134477825464948, |
|
"grad_norm": 0.13391474060102793, |
|
"learning_rate": 4.809220985691573e-08, |
|
"loss": 1.0969, |
|
"step": 5350 |
|
}, |
|
{ |
|
"epoch": 1.9223891273247498, |
|
"grad_norm": 0.12245753375271169, |
|
"learning_rate": 4.3124006359300475e-08, |
|
"loss": 1.1081, |
|
"step": 5375 |
|
}, |
|
{ |
|
"epoch": 1.9313304721030042, |
|
"grad_norm": 0.11951679306957765, |
|
"learning_rate": 3.815580286168521e-08, |
|
"loss": 1.1092, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 1.940271816881259, |
|
"grad_norm": 0.11961090020470673, |
|
"learning_rate": 3.3187599364069955e-08, |
|
"loss": 1.1007, |
|
"step": 5425 |
|
}, |
|
{ |
|
"epoch": 1.9492131616595136, |
|
"grad_norm": 0.12422327512042645, |
|
"learning_rate": 2.8219395866454688e-08, |
|
"loss": 1.098, |
|
"step": 5450 |
|
}, |
|
{ |
|
"epoch": 1.9581545064377681, |
|
"grad_norm": 0.125577214373895, |
|
"learning_rate": 2.3251192368839427e-08, |
|
"loss": 1.1009, |
|
"step": 5475 |
|
}, |
|
{ |
|
"epoch": 1.967095851216023, |
|
"grad_norm": 0.12978760398009362, |
|
"learning_rate": 1.8282988871224164e-08, |
|
"loss": 1.0965, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 1.9760371959942775, |
|
"grad_norm": 0.15452388925779964, |
|
"learning_rate": 1.3314785373608903e-08, |
|
"loss": 1.1058, |
|
"step": 5525 |
|
}, |
|
{ |
|
"epoch": 1.9849785407725322, |
|
"grad_norm": 0.12170241302022762, |
|
"learning_rate": 8.346581875993641e-09, |
|
"loss": 1.1066, |
|
"step": 5550 |
|
}, |
|
{ |
|
"epoch": 1.993919885550787, |
|
"grad_norm": 0.13568263553279267, |
|
"learning_rate": 3.3783783783783785e-09, |
|
"loss": 1.0966, |
|
"step": 5575 |
|
} |
|
], |
|
"logging_steps": 25, |
|
"max_steps": 5592, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 500, |
|
"total_flos": 1.1563888596221952e+16, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|