|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 8975, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.011142061281337047, |
|
"grad_norm": 0.04801425710320473, |
|
"learning_rate": 2.7859033291544785e-07, |
|
"loss": 2.4677, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.022284122562674095, |
|
"grad_norm": 0.04515479877591133, |
|
"learning_rate": 5.571806658308957e-07, |
|
"loss": 2.465, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.033426183844011144, |
|
"grad_norm": 0.04905751347541809, |
|
"learning_rate": 8.357709987463436e-07, |
|
"loss": 2.479, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.04456824512534819, |
|
"grad_norm": 0.0585104376077652, |
|
"learning_rate": 1.1143613316617914e-06, |
|
"loss": 2.4715, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.055710306406685235, |
|
"grad_norm": 0.06588415056467056, |
|
"learning_rate": 1.3929516645772392e-06, |
|
"loss": 2.4912, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.06685236768802229, |
|
"grad_norm": 0.07128334790468216, |
|
"learning_rate": 1.6715419974926873e-06, |
|
"loss": 2.449, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.07799442896935933, |
|
"grad_norm": 0.08448737859725952, |
|
"learning_rate": 1.9501323304081347e-06, |
|
"loss": 2.4613, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.08913649025069638, |
|
"grad_norm": 0.09213274717330933, |
|
"learning_rate": 2.2287226633235828e-06, |
|
"loss": 2.4366, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.10027855153203342, |
|
"grad_norm": 0.10825659334659576, |
|
"learning_rate": 2.507312996239031e-06, |
|
"loss": 2.4342, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.11142061281337047, |
|
"grad_norm": 0.11989778280258179, |
|
"learning_rate": 2.7859033291544785e-06, |
|
"loss": 2.4227, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.12256267409470752, |
|
"grad_norm": 0.11828930675983429, |
|
"learning_rate": 3.064493662069926e-06, |
|
"loss": 2.4346, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.13370473537604458, |
|
"grad_norm": 0.13713237643241882, |
|
"learning_rate": 3.3430839949853746e-06, |
|
"loss": 2.3954, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.14484679665738162, |
|
"grad_norm": 0.13783149421215057, |
|
"learning_rate": 3.6216743279008222e-06, |
|
"loss": 2.4027, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.15598885793871867, |
|
"grad_norm": 0.14380280673503876, |
|
"learning_rate": 3.9002646608162694e-06, |
|
"loss": 2.3894, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.1671309192200557, |
|
"grad_norm": 0.1586257964372635, |
|
"learning_rate": 4.178854993731718e-06, |
|
"loss": 2.3923, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.17827298050139276, |
|
"grad_norm": 0.15052290260791779, |
|
"learning_rate": 4.4574453266471655e-06, |
|
"loss": 2.3792, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.1894150417827298, |
|
"grad_norm": 0.16659782826900482, |
|
"learning_rate": 4.736035659562614e-06, |
|
"loss": 2.3657, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.20055710306406685, |
|
"grad_norm": 0.16844528913497925, |
|
"learning_rate": 5.014625992478062e-06, |
|
"loss": 2.3748, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.2116991643454039, |
|
"grad_norm": 0.17568887770175934, |
|
"learning_rate": 5.29321632539351e-06, |
|
"loss": 2.3609, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.22284122562674094, |
|
"grad_norm": 0.19198767840862274, |
|
"learning_rate": 5.571806658308957e-06, |
|
"loss": 2.3544, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.233983286908078, |
|
"grad_norm": 0.19439826905727386, |
|
"learning_rate": 5.850396991224405e-06, |
|
"loss": 2.3589, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.24512534818941503, |
|
"grad_norm": 0.2022334635257721, |
|
"learning_rate": 6.128987324139852e-06, |
|
"loss": 2.3612, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.2562674094707521, |
|
"grad_norm": 0.20067375898361206, |
|
"learning_rate": 6.4075776570553e-06, |
|
"loss": 2.3417, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.26740947075208915, |
|
"grad_norm": 0.2165047824382782, |
|
"learning_rate": 6.686167989970749e-06, |
|
"loss": 2.3481, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.2785515320334262, |
|
"grad_norm": 0.21726970374584198, |
|
"learning_rate": 6.9647583228861955e-06, |
|
"loss": 2.3331, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.28969359331476324, |
|
"grad_norm": 0.21808430552482605, |
|
"learning_rate": 7.2433486558016444e-06, |
|
"loss": 2.336, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.3008356545961003, |
|
"grad_norm": 0.23211392760276794, |
|
"learning_rate": 7.5219389887170925e-06, |
|
"loss": 2.3332, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.31197771587743733, |
|
"grad_norm": 0.24134476482868195, |
|
"learning_rate": 7.800529321632539e-06, |
|
"loss": 2.3377, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.3231197771587744, |
|
"grad_norm": 0.2478109449148178, |
|
"learning_rate": 8.079119654547987e-06, |
|
"loss": 2.3209, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.3342618384401114, |
|
"grad_norm": 0.25040534138679504, |
|
"learning_rate": 8.357709987463437e-06, |
|
"loss": 2.3218, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.34540389972144847, |
|
"grad_norm": 0.25530460476875305, |
|
"learning_rate": 8.636300320378883e-06, |
|
"loss": 2.3063, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.3565459610027855, |
|
"grad_norm": 0.258489727973938, |
|
"learning_rate": 8.914890653294331e-06, |
|
"loss": 2.3218, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.36768802228412256, |
|
"grad_norm": 0.28399255871772766, |
|
"learning_rate": 9.193480986209779e-06, |
|
"loss": 2.3063, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.3788300835654596, |
|
"grad_norm": 0.2863540053367615, |
|
"learning_rate": 9.472071319125227e-06, |
|
"loss": 2.2997, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.38997214484679665, |
|
"grad_norm": 0.2868782877922058, |
|
"learning_rate": 9.750661652040675e-06, |
|
"loss": 2.2947, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.4011142061281337, |
|
"grad_norm": 0.2859346270561218, |
|
"learning_rate": 1.0029251984956123e-05, |
|
"loss": 2.3054, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.41225626740947074, |
|
"grad_norm": 0.29307645559310913, |
|
"learning_rate": 1.030784231787157e-05, |
|
"loss": 2.2882, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.4233983286908078, |
|
"grad_norm": 0.3006058633327484, |
|
"learning_rate": 1.058643265078702e-05, |
|
"loss": 2.2893, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.43454038997214484, |
|
"grad_norm": 0.31586816906929016, |
|
"learning_rate": 1.0865022983702467e-05, |
|
"loss": 2.2967, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.4456824512534819, |
|
"grad_norm": 0.3375140428543091, |
|
"learning_rate": 1.1143613316617914e-05, |
|
"loss": 2.2694, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.4568245125348189, |
|
"grad_norm": 0.30628982186317444, |
|
"learning_rate": 1.142220364953336e-05, |
|
"loss": 2.2709, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.467966573816156, |
|
"grad_norm": 0.34296101331710815, |
|
"learning_rate": 1.170079398244881e-05, |
|
"loss": 2.2783, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.479108635097493, |
|
"grad_norm": 0.33544307947158813, |
|
"learning_rate": 1.1979384315364258e-05, |
|
"loss": 2.2825, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.49025069637883006, |
|
"grad_norm": 0.3508812487125397, |
|
"learning_rate": 1.2257974648279704e-05, |
|
"loss": 2.2793, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.5013927576601671, |
|
"grad_norm": 0.34851330518722534, |
|
"learning_rate": 1.2536564981195154e-05, |
|
"loss": 2.2703, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.5125348189415042, |
|
"grad_norm": 0.3589787185192108, |
|
"learning_rate": 1.28151553141106e-05, |
|
"loss": 2.2649, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.5236768802228412, |
|
"grad_norm": 0.3842375576496124, |
|
"learning_rate": 1.3093745647026049e-05, |
|
"loss": 2.264, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.5348189415041783, |
|
"grad_norm": 0.3963667154312134, |
|
"learning_rate": 1.3372335979941498e-05, |
|
"loss": 2.2567, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.5459610027855153, |
|
"grad_norm": 0.40061014890670776, |
|
"learning_rate": 1.3650926312856945e-05, |
|
"loss": 2.2567, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.5571030640668524, |
|
"grad_norm": 0.3819388747215271, |
|
"learning_rate": 1.3929516645772391e-05, |
|
"loss": 2.2545, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.5682451253481894, |
|
"grad_norm": 0.40177789330482483, |
|
"learning_rate": 1.420810697868784e-05, |
|
"loss": 2.2509, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.5793871866295265, |
|
"grad_norm": 0.4258297085762024, |
|
"learning_rate": 1.4486697311603289e-05, |
|
"loss": 2.2466, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.5905292479108635, |
|
"grad_norm": 0.44000041484832764, |
|
"learning_rate": 1.4765287644518735e-05, |
|
"loss": 2.2453, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.6016713091922006, |
|
"grad_norm": 0.4116784632205963, |
|
"learning_rate": 1.5043877977434185e-05, |
|
"loss": 2.2708, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.6128133704735376, |
|
"grad_norm": 0.39489638805389404, |
|
"learning_rate": 1.5322468310349633e-05, |
|
"loss": 2.2656, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.6239554317548747, |
|
"grad_norm": 0.43062567710876465, |
|
"learning_rate": 1.5601058643265078e-05, |
|
"loss": 2.2305, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.6350974930362117, |
|
"grad_norm": 0.4567050635814667, |
|
"learning_rate": 1.587964897618053e-05, |
|
"loss": 2.2392, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.6462395543175488, |
|
"grad_norm": 0.40272602438926697, |
|
"learning_rate": 1.6158239309095974e-05, |
|
"loss": 2.2327, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.6573816155988857, |
|
"grad_norm": 0.4182547628879547, |
|
"learning_rate": 1.6436829642011422e-05, |
|
"loss": 2.227, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 0.6685236768802229, |
|
"grad_norm": 0.4616607129573822, |
|
"learning_rate": 1.6715419974926873e-05, |
|
"loss": 2.223, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.6796657381615598, |
|
"grad_norm": 0.4605095386505127, |
|
"learning_rate": 1.6994010307842318e-05, |
|
"loss": 2.229, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 0.6908077994428969, |
|
"grad_norm": 0.48064395785331726, |
|
"learning_rate": 1.7272600640757766e-05, |
|
"loss": 2.2285, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 0.7019498607242339, |
|
"grad_norm": 0.45375484228134155, |
|
"learning_rate": 1.7551190973673214e-05, |
|
"loss": 2.231, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 0.713091922005571, |
|
"grad_norm": 0.4386723041534424, |
|
"learning_rate": 1.7829781306588662e-05, |
|
"loss": 2.2295, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 0.724233983286908, |
|
"grad_norm": 0.4519370198249817, |
|
"learning_rate": 1.810837163950411e-05, |
|
"loss": 2.2185, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.7353760445682451, |
|
"grad_norm": 0.45770174264907837, |
|
"learning_rate": 1.8386961972419558e-05, |
|
"loss": 2.2146, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 0.7465181058495822, |
|
"grad_norm": 0.46970096230506897, |
|
"learning_rate": 1.8665552305335006e-05, |
|
"loss": 2.2268, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 0.7576601671309192, |
|
"grad_norm": 0.4301040768623352, |
|
"learning_rate": 1.8944142638250454e-05, |
|
"loss": 2.2186, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 0.7688022284122563, |
|
"grad_norm": 0.49275335669517517, |
|
"learning_rate": 1.9222732971165902e-05, |
|
"loss": 2.2143, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 0.7799442896935933, |
|
"grad_norm": 0.4531406760215759, |
|
"learning_rate": 1.950132330408135e-05, |
|
"loss": 2.2087, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.7910863509749304, |
|
"grad_norm": 0.44058841466903687, |
|
"learning_rate": 1.97799136369968e-05, |
|
"loss": 2.2081, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 0.8022284122562674, |
|
"grad_norm": 0.4620118737220764, |
|
"learning_rate": 1.9993253992196043e-05, |
|
"loss": 2.2029, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 0.8133704735376045, |
|
"grad_norm": 0.4749448299407959, |
|
"learning_rate": 1.9776845355951603e-05, |
|
"loss": 2.1974, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 0.8245125348189415, |
|
"grad_norm": 0.4482609033584595, |
|
"learning_rate": 1.9262050899616325e-05, |
|
"loss": 2.2229, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 0.8356545961002786, |
|
"grad_norm": 0.4689159095287323, |
|
"learning_rate": 1.8464581965697866e-05, |
|
"loss": 2.2081, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.8467966573816156, |
|
"grad_norm": 0.4448583126068115, |
|
"learning_rate": 1.7408777020560473e-05, |
|
"loss": 2.2181, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 0.8579387186629527, |
|
"grad_norm": 0.513080894947052, |
|
"learning_rate": 1.6126858853144854e-05, |
|
"loss": 2.2045, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 0.8690807799442897, |
|
"grad_norm": 0.4850214421749115, |
|
"learning_rate": 1.465795114698568e-05, |
|
"loss": 2.1966, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 0.8802228412256268, |
|
"grad_norm": 0.5757237672805786, |
|
"learning_rate": 1.3046884439396632e-05, |
|
"loss": 2.2019, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 0.8913649025069638, |
|
"grad_norm": 0.48911547660827637, |
|
"learning_rate": 1.1342827909521198e-05, |
|
"loss": 2.1994, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.9025069637883009, |
|
"grad_norm": 0.5143416523933411, |
|
"learning_rate": 9.597788752588108e-06, |
|
"loss": 2.207, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 0.9136490250696379, |
|
"grad_norm": 0.483064204454422, |
|
"learning_rate": 7.882117054290375e-06, |
|
"loss": 2.1971, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 0.924791086350975, |
|
"grad_norm": 0.4610692262649536, |
|
"learning_rate": 6.213603769467132e-06, |
|
"loss": 2.1977, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 0.935933147632312, |
|
"grad_norm": 0.50405353307724, |
|
"learning_rate": 4.660649942045826e-06, |
|
"loss": 2.1955, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 0.947075208913649, |
|
"grad_norm": 0.47849225997924805, |
|
"learning_rate": 3.27065116687201e-06, |
|
"loss": 2.1906, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.958217270194986, |
|
"grad_norm": 0.49040549993515015, |
|
"learning_rate": 2.0860297089355943e-06, |
|
"loss": 2.1875, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 0.9693593314763231, |
|
"grad_norm": 0.49369877576828003, |
|
"learning_rate": 1.1429397910307794e-06, |
|
"loss": 2.2054, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 0.9805013927576601, |
|
"grad_norm": 0.5104972124099731, |
|
"learning_rate": 4.701641798198353e-07, |
|
"loss": 2.2121, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 0.9916434540389972, |
|
"grad_norm": 0.48128950595855713, |
|
"learning_rate": 8.823574609897134e-08, |
|
"loss": 2.1928, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 8975, |
|
"total_flos": 1.3078189256852111e+18, |
|
"train_loss": 2.2911263094506222, |
|
"train_runtime": 2843.733, |
|
"train_samples_per_second": 50.495, |
|
"train_steps_per_second": 3.156 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 8975, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.3078189256852111e+18, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|