{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9984, |
|
"eval_steps": 100, |
|
"global_step": 468, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"completion_length": 629.0333465576172, |
|
"epoch": 0.010666666666666666, |
|
"grad_norm": 0.6769814278795893, |
|
"kl": 9.946823120117187e-05, |
|
"learning_rate": 3.1914893617021275e-07, |
|
"loss": 0.0, |
|
"reward": 0.625000013411045, |
|
"reward_std": 0.2814582511782646, |
|
"rewards/accuracy_reward": 0.625000013411045, |
|
"rewards/format_reward": 0.0, |
|
"step": 5 |
|
}, |
|
{ |
|
"completion_length": 555.6875137329101, |
|
"epoch": 0.021333333333333333, |
|
"grad_norm": 1.9840891212418084, |
|
"kl": 0.0014047443866729737, |
|
"learning_rate": 6.382978723404255e-07, |
|
"loss": 0.0001, |
|
"reward": 0.6833333499729634, |
|
"reward_std": 0.2381569817662239, |
|
"rewards/accuracy_reward": 0.6833333499729634, |
|
"rewards/format_reward": 0.0, |
|
"step": 10 |
|
}, |
|
{ |
|
"completion_length": 596.6166839599609, |
|
"epoch": 0.032, |
|
"grad_norm": 0.32100565406672044, |
|
"kl": 0.0005277872085571289, |
|
"learning_rate": 9.574468085106384e-07, |
|
"loss": 0.0, |
|
"reward": 0.6541666820645332, |
|
"reward_std": 0.2742413729429245, |
|
"rewards/accuracy_reward": 0.6541666820645332, |
|
"rewards/format_reward": 0.0, |
|
"step": 15 |
|
}, |
|
{ |
|
"completion_length": 622.8041839599609, |
|
"epoch": 0.042666666666666665, |
|
"grad_norm": 2.7595418706029604, |
|
"kl": 0.0008854150772094726, |
|
"learning_rate": 1.276595744680851e-06, |
|
"loss": 0.0, |
|
"reward": 0.6791666865348815, |
|
"reward_std": 0.2670244947075844, |
|
"rewards/accuracy_reward": 0.6750000178813934, |
|
"rewards/format_reward": 0.00416666679084301, |
|
"step": 20 |
|
}, |
|
{ |
|
"completion_length": 632.7000167846679, |
|
"epoch": 0.05333333333333334, |
|
"grad_norm": 0.4465988016263813, |
|
"kl": 0.0010046005249023438, |
|
"learning_rate": 1.5957446808510639e-06, |
|
"loss": 0.0, |
|
"reward": 0.6750000156462193, |
|
"reward_std": 0.20207259058952332, |
|
"rewards/accuracy_reward": 0.6750000156462193, |
|
"rewards/format_reward": 0.0, |
|
"step": 25 |
|
}, |
|
{ |
|
"completion_length": 631.6250205993653, |
|
"epoch": 0.064, |
|
"grad_norm": 5.122527862000083, |
|
"kl": 0.002363252639770508, |
|
"learning_rate": 1.9148936170212767e-06, |
|
"loss": 0.0001, |
|
"reward": 0.7041666809469461, |
|
"reward_std": 0.2381569817662239, |
|
"rewards/accuracy_reward": 0.7041666809469461, |
|
"rewards/format_reward": 0.0, |
|
"step": 30 |
|
}, |
|
{ |
|
"completion_length": 617.3833492279052, |
|
"epoch": 0.07466666666666667, |
|
"grad_norm": 0.4565019319043069, |
|
"kl": 0.0020374774932861326, |
|
"learning_rate": 2.2340425531914894e-06, |
|
"loss": 0.0001, |
|
"reward": 0.658333345502615, |
|
"reward_std": 0.1587713211774826, |
|
"rewards/accuracy_reward": 0.658333345502615, |
|
"rewards/format_reward": 0.0, |
|
"step": 35 |
|
}, |
|
{ |
|
"completion_length": 605.6125198364258, |
|
"epoch": 0.08533333333333333, |
|
"grad_norm": 0.1538327867321423, |
|
"kl": 0.007851552963256837, |
|
"learning_rate": 2.553191489361702e-06, |
|
"loss": 0.0003, |
|
"reward": 0.7458333469927311, |
|
"reward_std": 0.16598819941282272, |
|
"rewards/accuracy_reward": 0.7458333469927311, |
|
"rewards/format_reward": 0.0, |
|
"step": 40 |
|
}, |
|
{ |
|
"completion_length": 577.2958511352539, |
|
"epoch": 0.096, |
|
"grad_norm": 0.37253452177444696, |
|
"kl": 0.021607685089111327, |
|
"learning_rate": 2.872340425531915e-06, |
|
"loss": 0.0009, |
|
"reward": 0.75416667945683, |
|
"reward_std": 0.14433756470680237, |
|
"rewards/accuracy_reward": 0.75416667945683, |
|
"rewards/format_reward": 0.0, |
|
"step": 45 |
|
}, |
|
{ |
|
"completion_length": 642.6583526611328, |
|
"epoch": 0.10666666666666667, |
|
"grad_norm": 0.1770926163615656, |
|
"kl": 0.0023632049560546875, |
|
"learning_rate": 2.9996241442585123e-06, |
|
"loss": 0.0001, |
|
"reward": 0.7375000134110451, |
|
"reward_std": 0.16598819941282272, |
|
"rewards/accuracy_reward": 0.7375000134110451, |
|
"rewards/format_reward": 0.0, |
|
"step": 50 |
|
}, |
|
{ |
|
"completion_length": 618.7875183105468, |
|
"epoch": 0.11733333333333333, |
|
"grad_norm": 1.1185074340159897, |
|
"kl": 0.0026407241821289062, |
|
"learning_rate": 2.9973279301399446e-06, |
|
"loss": 0.0001, |
|
"reward": 0.7083333477377891, |
|
"reward_std": 0.18042195588350296, |
|
"rewards/accuracy_reward": 0.7083333477377891, |
|
"rewards/format_reward": 0.0, |
|
"step": 55 |
|
}, |
|
{ |
|
"completion_length": 586.7333518981934, |
|
"epoch": 0.128, |
|
"grad_norm": 0.7174325392644639, |
|
"kl": 0.003997516632080078, |
|
"learning_rate": 2.992947502998804e-06, |
|
"loss": 0.0002, |
|
"reward": 0.7500000119209289, |
|
"reward_std": 0.1154700517654419, |
|
"rewards/accuracy_reward": 0.7500000119209289, |
|
"rewards/format_reward": 0.0, |
|
"step": 60 |
|
}, |
|
{ |
|
"completion_length": 622.9291847229003, |
|
"epoch": 0.13866666666666666, |
|
"grad_norm": 0.670903073904803, |
|
"kl": 0.00260772705078125, |
|
"learning_rate": 2.9864889601923268e-06, |
|
"loss": 0.0001, |
|
"reward": 0.691666680201888, |
|
"reward_std": 0.21650634706020355, |
|
"rewards/accuracy_reward": 0.691666680201888, |
|
"rewards/format_reward": 0.0, |
|
"step": 65 |
|
}, |
|
{ |
|
"completion_length": 549.3833484649658, |
|
"epoch": 0.14933333333333335, |
|
"grad_norm": 0.4088284284161271, |
|
"kl": 0.004892921447753907, |
|
"learning_rate": 2.977961291721137e-06, |
|
"loss": 0.0002, |
|
"reward": 0.8541666768491268, |
|
"reward_std": 0.12268693000078201, |
|
"rewards/accuracy_reward": 0.8541666768491268, |
|
"rewards/format_reward": 0.0, |
|
"step": 70 |
|
}, |
|
{ |
|
"completion_length": 607.4750160217285, |
|
"epoch": 0.16, |
|
"grad_norm": 0.38437819950891683, |
|
"kl": 0.002490520477294922, |
|
"learning_rate": 2.9673763677155655e-06, |
|
"loss": 0.0001, |
|
"reward": 0.7416666783392429, |
|
"reward_std": 0.1587713211774826, |
|
"rewards/accuracy_reward": 0.7416666783392429, |
|
"rewards/format_reward": 0.0, |
|
"step": 75 |
|
}, |
|
{ |
|
"completion_length": 606.7250198364258, |
|
"epoch": 0.17066666666666666, |
|
"grad_norm": 0.427244505552826, |
|
"kl": 0.0034360885620117188, |
|
"learning_rate": 2.9547489219129666e-06, |
|
"loss": 0.0001, |
|
"reward": 0.7416666761040688, |
|
"reward_std": 0.17320507764816284, |
|
"rewards/accuracy_reward": 0.7416666761040688, |
|
"rewards/format_reward": 0.0, |
|
"step": 80 |
|
}, |
|
{ |
|
"completion_length": 597.8541831970215, |
|
"epoch": 0.18133333333333335, |
|
"grad_norm": 0.2807801163545959, |
|
"kl": 0.0037883758544921876, |
|
"learning_rate": 2.9400965311490175e-06, |
|
"loss": 0.0002, |
|
"reward": 0.75416667945683, |
|
"reward_std": 0.16598819941282272, |
|
"rewards/accuracy_reward": 0.75416667945683, |
|
"rewards/format_reward": 0.0, |
|
"step": 85 |
|
}, |
|
{ |
|
"completion_length": 596.2541862487793, |
|
"epoch": 0.192, |
|
"grad_norm": 0.521374734305685, |
|
"kl": 0.0034576416015625, |
|
"learning_rate": 2.9234395908915565e-06, |
|
"loss": 0.0001, |
|
"reward": 0.7833333477377892, |
|
"reward_std": 0.18042195588350296, |
|
"rewards/accuracy_reward": 0.7833333477377892, |
|
"rewards/format_reward": 0.0, |
|
"step": 90 |
|
}, |
|
{ |
|
"completion_length": 571.3958503723145, |
|
"epoch": 0.20266666666666666, |
|
"grad_norm": 0.3433615148143651, |
|
"kl": 0.003693962097167969, |
|
"learning_rate": 2.904801286851009e-06, |
|
"loss": 0.0001, |
|
"reward": 0.758333345502615, |
|
"reward_std": 0.18042195588350296, |
|
"rewards/accuracy_reward": 0.758333345502615, |
|
"rewards/format_reward": 0.0, |
|
"step": 95 |
|
}, |
|
{ |
|
"completion_length": 540.612516784668, |
|
"epoch": 0.21333333333333335, |
|
"grad_norm": 1.316659347652175, |
|
"kl": 0.003926467895507812, |
|
"learning_rate": 2.884207562706925e-06, |
|
"loss": 0.0002, |
|
"reward": 0.8166666753590107, |
|
"reward_std": 0.13712068647146225, |
|
"rewards/accuracy_reward": 0.8166666753590107, |
|
"rewards/format_reward": 0.0, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.21333333333333335, |
|
"eval_completion_length": 584.8229370117188, |
|
"eval_kl": 0.0649594835415008, |
|
"eval_loss": 0.0026084992568939924, |
|
"eval_reward": 0.6781449270476202, |
|
"eval_reward_std": 0.17168815557364445, |
|
"eval_rewards/accuracy_reward": 0.6781449270476202, |
|
"eval_rewards/format_reward": 0.0, |
|
"eval_runtime": 2224.3049, |
|
"eval_samples_per_second": 2.248, |
|
"eval_steps_per_second": 0.024, |
|
"step": 100 |
|
}, |
|
{ |
|
"completion_length": 555.7166831970214, |
|
"epoch": 0.224, |
|
"grad_norm": 0.27423172301140597, |
|
"kl": 0.004434013366699218, |
|
"learning_rate": 2.8616870839955444e-06, |
|
"loss": 0.0002, |
|
"reward": 0.7791666761040688, |
|
"reward_std": 0.12268693000078201, |
|
"rewards/accuracy_reward": 0.7791666761040688, |
|
"rewards/format_reward": 0.0, |
|
"step": 105 |
|
}, |
|
{ |
|
"completion_length": 559.5791831970215, |
|
"epoch": 0.23466666666666666, |
|
"grad_norm": 0.31053268918266963, |
|
"kl": 0.0049915313720703125, |
|
"learning_rate": 2.837271198208662e-06, |
|
"loss": 0.0002, |
|
"reward": 0.779166679829359, |
|
"reward_std": 0.12990380823612213, |
|
"rewards/accuracy_reward": 0.779166679829359, |
|
"rewards/format_reward": 0.0, |
|
"step": 110 |
|
}, |
|
{ |
|
"completion_length": 572.1333473205566, |
|
"epoch": 0.24533333333333332, |
|
"grad_norm": 0.5350039478758858, |
|
"kl": 0.005301475524902344, |
|
"learning_rate": 2.8109938911593322e-06, |
|
"loss": 0.0002, |
|
"reward": 0.8041666813194752, |
|
"reward_std": 0.14433756470680237, |
|
"rewards/accuracy_reward": 0.8041666813194752, |
|
"rewards/format_reward": 0.0, |
|
"step": 115 |
|
}, |
|
{ |
|
"completion_length": 582.1333488464355, |
|
"epoch": 0.256, |
|
"grad_norm": 0.2859156946575482, |
|
"kl": 2905.6047693252563, |
|
"learning_rate": 2.7828917396751474e-06, |
|
"loss": 116.0331, |
|
"reward": 0.7208333447575569, |
|
"reward_std": 0.1587713211774826, |
|
"rewards/accuracy_reward": 0.7208333447575569, |
|
"rewards/format_reward": 0.0, |
|
"step": 120 |
|
}, |
|
{ |
|
"completion_length": 530.0416824340821, |
|
"epoch": 0.26666666666666666, |
|
"grad_norm": 1.1177982976359848, |
|
"kl": 0.004655075073242187, |
|
"learning_rate": 2.753003860684943e-06, |
|
"loss": 0.0002, |
|
"reward": 0.8416666783392429, |
|
"reward_std": 0.12268693000078201, |
|
"rewards/accuracy_reward": 0.8416666783392429, |
|
"rewards/format_reward": 0.0, |
|
"step": 125 |
|
}, |
|
{ |
|
"completion_length": 557.0750175476074, |
|
"epoch": 0.2773333333333333, |
|
"grad_norm": 0.4585614836700422, |
|
"kl": 0.0064525604248046875, |
|
"learning_rate": 2.721371856769793e-06, |
|
"loss": 0.0003, |
|
"reward": 0.829166679084301, |
|
"reward_std": 0.13712068647146225, |
|
"rewards/accuracy_reward": 0.829166679084301, |
|
"rewards/format_reward": 0.0, |
|
"step": 130 |
|
}, |
|
{ |
|
"completion_length": 587.7958473205566, |
|
"epoch": 0.288, |
|
"grad_norm": 0.0950533723109889, |
|
"kl": 0.005008697509765625, |
|
"learning_rate": 2.688039758254093e-06, |
|
"loss": 0.0002, |
|
"reward": 0.7541666768491269, |
|
"reward_std": 0.13712068647146225, |
|
"rewards/accuracy_reward": 0.7541666768491269, |
|
"rewards/format_reward": 0.0, |
|
"step": 135 |
|
}, |
|
{ |
|
"completion_length": 543.9125160217285, |
|
"epoch": 0.2986666666666667, |
|
"grad_norm": 0.6441875290120029, |
|
"kl": 0.005208587646484375, |
|
"learning_rate": 2.65305396191733e-06, |
|
"loss": 0.0002, |
|
"reward": 0.816666679084301, |
|
"reward_std": 0.13712068647146225, |
|
"rewards/accuracy_reward": 0.816666679084301, |
|
"rewards/format_reward": 0.0, |
|
"step": 140 |
|
}, |
|
{ |
|
"completion_length": 569.7666854858398, |
|
"epoch": 0.30933333333333335, |
|
"grad_norm": 0.5652154440314774, |
|
"kl": 0.006348609924316406, |
|
"learning_rate": 2.61646316641186e-06, |
|
"loss": 0.0003, |
|
"reward": 0.7750000104308128, |
|
"reward_std": 0.14433756470680237, |
|
"rewards/accuracy_reward": 0.7750000104308128, |
|
"rewards/format_reward": 0.0, |
|
"step": 145 |
|
}, |
|
{ |
|
"completion_length": 574.3875183105469, |
|
"epoch": 0.32, |
|
"grad_norm": 0.23645532092478114, |
|
"kl": 0.00558013916015625, |
|
"learning_rate": 2.5783183044765715e-06, |
|
"loss": 0.0002, |
|
"reward": 0.7708333481103182, |
|
"reward_std": 0.16598819941282272, |
|
"rewards/accuracy_reward": 0.7708333481103182, |
|
"rewards/format_reward": 0.0, |
|
"step": 150 |
|
}, |
|
{ |
|
"completion_length": 527.1708511352539, |
|
"epoch": 0.33066666666666666, |
|
"grad_norm": 0.34192011002352146, |
|
"kl": 0.007277297973632813, |
|
"learning_rate": 2.5386724720408135e-06, |
|
"loss": 0.0003, |
|
"reward": 0.8000000149011612, |
|
"reward_std": 0.17320507764816284, |
|
"rewards/accuracy_reward": 0.8000000149011612, |
|
"rewards/format_reward": 0.0, |
|
"step": 155 |
|
}, |
|
{ |
|
"completion_length": 498.82918281555175, |
|
"epoch": 0.3413333333333333, |
|
"grad_norm": 0.40065027769948436, |
|
"kl": 0.00552520751953125, |
|
"learning_rate": 2.49758085431725e-06, |
|
"loss": 0.0002, |
|
"reward": 0.8000000093132258, |
|
"reward_std": 0.1154700517654419, |
|
"rewards/accuracy_reward": 0.8000000093132258, |
|
"rewards/format_reward": 0.0, |
|
"step": 160 |
|
}, |
|
{ |
|
"completion_length": 578.9625190734863, |
|
"epoch": 0.352, |
|
"grad_norm": 0.38287823891545164, |
|
"kl": 0.004776382446289062, |
|
"learning_rate": 2.455100648986533e-06, |
|
"loss": 0.0002, |
|
"reward": 0.7416666746139526, |
|
"reward_std": 0.14433756470680237, |
|
"rewards/accuracy_reward": 0.7416666746139526, |
|
"rewards/format_reward": 0.0, |
|
"step": 165 |
|
}, |
|
{ |
|
"completion_length": 523.1041854858398, |
|
"epoch": 0.3626666666666667, |
|
"grad_norm": 0.284445909761847, |
|
"kl": 0.013115310668945312, |
|
"learning_rate": 2.4112909865807053e-06, |
|
"loss": 0.0005, |
|
"reward": 0.8583333406597375, |
|
"reward_std": 0.10103629529476166, |
|
"rewards/accuracy_reward": 0.8583333406597375, |
|
"rewards/format_reward": 0.0, |
|
"step": 170 |
|
}, |
|
{ |
|
"completion_length": 570.7833480834961, |
|
"epoch": 0.37333333333333335, |
|
"grad_norm": 0.23347186760127464, |
|
"kl": 0.006616973876953125, |
|
"learning_rate": 2.366212848176164e-06, |
|
"loss": 0.0003, |
|
"reward": 0.8291666753590107, |
|
"reward_std": 0.12268693000078201, |
|
"rewards/accuracy_reward": 0.8291666753590107, |
|
"rewards/format_reward": 0.0, |
|
"step": 175 |
|
}, |
|
{ |
|
"completion_length": 580.6666816711426, |
|
"epoch": 0.384, |
|
"grad_norm": 0.39704873396377327, |
|
"kl": 0.008529281616210938, |
|
"learning_rate": 2.319928980510752e-06, |
|
"loss": 0.0003, |
|
"reward": 0.770833345502615, |
|
"reward_std": 0.14433756470680237, |
|
"rewards/accuracy_reward": 0.770833345502615, |
|
"rewards/format_reward": 0.0, |
|
"step": 180 |
|
}, |
|
{ |
|
"completion_length": 585.516682434082, |
|
"epoch": 0.39466666666666667, |
|
"grad_norm": 0.9521350378322919, |
|
"kl": 0.00994873046875, |
|
"learning_rate": 2.272503808643123e-06, |
|
"loss": 0.0004, |
|
"reward": 0.7375000108033418, |
|
"reward_std": 0.13712068647146225, |
|
"rewards/accuracy_reward": 0.7375000108033418, |
|
"rewards/format_reward": 0.0, |
|
"step": 185 |
|
}, |
|
{ |
|
"completion_length": 580.2583480834961, |
|
"epoch": 0.4053333333333333, |
|
"grad_norm": 0.22517264586722138, |
|
"kl": 0.006447982788085937, |
|
"learning_rate": 2.2240033462759628e-06, |
|
"loss": 0.0003, |
|
"reward": 0.7250000115483999, |
|
"reward_std": 0.12990380823612213, |
|
"rewards/accuracy_reward": 0.7250000115483999, |
|
"rewards/format_reward": 0.0, |
|
"step": 190 |
|
}, |
|
{ |
|
"completion_length": 526.1250137329101, |
|
"epoch": 0.416, |
|
"grad_norm": 0.3639815103811325, |
|
"kl": 0.010872650146484374, |
|
"learning_rate": 2.1744951038678905e-06, |
|
"loss": 0.0004, |
|
"reward": 0.8041666753590107, |
|
"reward_std": 0.12990380823612213, |
|
"rewards/accuracy_reward": 0.8041666753590107, |
|
"rewards/format_reward": 0.0, |
|
"step": 195 |
|
}, |
|
{ |
|
"completion_length": 602.2041851043701, |
|
"epoch": 0.4266666666666667, |
|
"grad_norm": 0.2467674604552363, |
|
"kl": 0.007511520385742187, |
|
"learning_rate": 2.124047994661941e-06, |
|
"loss": 0.0003, |
|
"reward": 0.7125000089406968, |
|
"reward_std": 0.14433756470680237, |
|
"rewards/accuracy_reward": 0.7125000089406968, |
|
"rewards/format_reward": 0.0, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.4266666666666667, |
|
"eval_completion_length": 575.310859242822, |
|
"eval_kl": 0.007285342854299363, |
|
"eval_loss": 0.00030118186259642243, |
|
"eval_reward": 0.6825902547426285, |
|
"eval_reward_std": 0.16835552112312074, |
|
"eval_rewards/accuracy_reward": 0.6823248622144104, |
|
"eval_rewards/format_reward": 0.00026539278922566945, |
|
"eval_runtime": 2201.1944, |
|
"eval_samples_per_second": 2.271, |
|
"eval_steps_per_second": 0.024, |
|
"step": 200 |
|
}, |
|
{ |
|
"completion_length": 567.8333488464356, |
|
"epoch": 0.43733333333333335, |
|
"grad_norm": 0.19466659798874456, |
|
"kl": 0.006165504455566406, |
|
"learning_rate": 2.072732238761434e-06, |
|
"loss": 0.0002, |
|
"reward": 0.7625000100582838, |
|
"reward_std": 0.1515544429421425, |
|
"rewards/accuracy_reward": 0.7625000100582838, |
|
"rewards/format_reward": 0.0, |
|
"step": 205 |
|
}, |
|
{ |
|
"completion_length": 597.8791809082031, |
|
"epoch": 0.448, |
|
"grad_norm": 0.1973895783664286, |
|
"kl": 0.00566558837890625, |
|
"learning_rate": 2.0206192653867536e-06, |
|
"loss": 0.0002, |
|
"reward": 0.7875000104308129, |
|
"reward_std": 0.12990380823612213, |
|
"rewards/accuracy_reward": 0.7875000104308129, |
|
"rewards/format_reward": 0.0, |
|
"step": 210 |
|
}, |
|
{ |
|
"completion_length": 567.9416831970215, |
|
"epoch": 0.45866666666666667, |
|
"grad_norm": 0.17114030974558328, |
|
"kl": 0.0057373046875, |
|
"learning_rate": 1.967781613449095e-06, |
|
"loss": 0.0002, |
|
"reward": 0.7041666761040688, |
|
"reward_std": 0.12990380823612213, |
|
"rewards/accuracy_reward": 0.7041666761040688, |
|
"rewards/format_reward": 0.0, |
|
"step": 215 |
|
}, |
|
{ |
|
"completion_length": 576.6125190734863, |
|
"epoch": 0.4693333333333333, |
|
"grad_norm": 0.15616587500490686, |
|
"kl": 0.006618499755859375, |
|
"learning_rate": 1.9142928305795637e-06, |
|
"loss": 0.0003, |
|
"reward": 0.7708333447575569, |
|
"reward_std": 0.16598819941282272, |
|
"rewards/accuracy_reward": 0.7708333447575569, |
|
"rewards/format_reward": 0.0, |
|
"step": 220 |
|
}, |
|
{ |
|
"completion_length": 572.6916816711425, |
|
"epoch": 0.48, |
|
"grad_norm": 0.16702488825594472, |
|
"kl": 0.004993247985839844, |
|
"learning_rate": 1.8602273707541886e-06, |
|
"loss": 0.0002, |
|
"reward": 0.7916666757315397, |
|
"reward_std": 0.12268693000078201, |
|
"rewards/accuracy_reward": 0.7916666757315397, |
|
"rewards/format_reward": 0.0, |
|
"step": 225 |
|
}, |
|
{ |
|
"completion_length": 579.0583503723144, |
|
"epoch": 0.49066666666666664, |
|
"grad_norm": 0.21881388737259078, |
|
"kl": 0.031103515625, |
|
"learning_rate": 1.8056604906573418e-06, |
|
"loss": 0.0012, |
|
"reward": 0.7750000115483999, |
|
"reward_std": 0.1587713211774826, |
|
"rewards/accuracy_reward": 0.7750000115483999, |
|
"rewards/format_reward": 0.0, |
|
"step": 230 |
|
}, |
|
{ |
|
"completion_length": 597.841682434082, |
|
"epoch": 0.5013333333333333, |
|
"grad_norm": 0.23095788778773238, |
|
"kl": 0.004894065856933594, |
|
"learning_rate": 1.7506681449278226e-06, |
|
"loss": 0.0002, |
|
"reward": 0.7750000078231096, |
|
"reward_std": 0.14433756470680237, |
|
"rewards/accuracy_reward": 0.7750000078231096, |
|
"rewards/format_reward": 0.0, |
|
"step": 235 |
|
}, |
|
{ |
|
"completion_length": 618.4500175476074, |
|
"epoch": 0.512, |
|
"grad_norm": 0.16218399606161332, |
|
"kl": 0.004921722412109375, |
|
"learning_rate": 1.6953268804334257e-06, |
|
"loss": 0.0002, |
|
"reward": 0.7208333402872086, |
|
"reward_std": 0.1154700517654419, |
|
"rewards/accuracy_reward": 0.7208333402872086, |
|
"rewards/format_reward": 0.0, |
|
"step": 240 |
|
}, |
|
{ |
|
"completion_length": 577.145849609375, |
|
"epoch": 0.5226666666666666, |
|
"grad_norm": 0.2312126432153185, |
|
"kl": 0.005503082275390625, |
|
"learning_rate": 1.6397137297211436e-06, |
|
"loss": 0.0002, |
|
"reward": 0.7458333425223828, |
|
"reward_std": 0.12990380823612213, |
|
"rewards/accuracy_reward": 0.7458333425223828, |
|
"rewards/format_reward": 0.0, |
|
"step": 245 |
|
}, |
|
{ |
|
"completion_length": 535.725016784668, |
|
"epoch": 0.5333333333333333, |
|
"grad_norm": 0.8329104997318444, |
|
"kl": 0.005479049682617187, |
|
"learning_rate": 1.5839061037913395e-06, |
|
"loss": 0.0002, |
|
"reward": 0.7333333425223827, |
|
"reward_std": 0.12990380823612213, |
|
"rewards/accuracy_reward": 0.7333333425223827, |
|
"rewards/format_reward": 0.0, |
|
"step": 250 |
|
}, |
|
{ |
|
"completion_length": 562.9125198364258, |
|
"epoch": 0.544, |
|
"grad_norm": 0.7014962804091782, |
|
"kl": 0.00579681396484375, |
|
"learning_rate": 1.527981684345115e-06, |
|
"loss": 0.0002, |
|
"reward": 0.7541666772216559, |
|
"reward_std": 0.14433756470680237, |
|
"rewards/accuracy_reward": 0.7541666772216559, |
|
"rewards/format_reward": 0.0, |
|
"step": 255 |
|
}, |
|
{ |
|
"completion_length": 549.2666793823242, |
|
"epoch": 0.5546666666666666, |
|
"grad_norm": 0.14628604121619648, |
|
"kl": 0.005747222900390625, |
|
"learning_rate": 1.4720183156548855e-06, |
|
"loss": 0.0002, |
|
"reward": 0.7916666757315397, |
|
"reward_std": 0.12268693000078201, |
|
"rewards/accuracy_reward": 0.7916666757315397, |
|
"rewards/format_reward": 0.0, |
|
"step": 260 |
|
}, |
|
{ |
|
"completion_length": 549.0625137329101, |
|
"epoch": 0.5653333333333334, |
|
"grad_norm": 0.22137684029876195, |
|
"kl": 0.006251907348632813, |
|
"learning_rate": 1.4160938962086612e-06, |
|
"loss": 0.0002, |
|
"reward": 0.7666666731238365, |
|
"reward_std": 0.08660253882408142, |
|
"rewards/accuracy_reward": 0.7666666731238365, |
|
"rewards/format_reward": 0.0, |
|
"step": 265 |
|
}, |
|
{ |
|
"completion_length": 526.2583480834961, |
|
"epoch": 0.576, |
|
"grad_norm": 0.6919808405276313, |
|
"kl": 0.004993057250976563, |
|
"learning_rate": 1.3602862702788567e-06, |
|
"loss": 0.0002, |
|
"reward": 0.7958333443850278, |
|
"reward_std": 0.12990380823612213, |
|
"rewards/accuracy_reward": 0.7958333443850278, |
|
"rewards/format_reward": 0.0, |
|
"step": 270 |
|
}, |
|
{ |
|
"completion_length": 623.1583526611328, |
|
"epoch": 0.5866666666666667, |
|
"grad_norm": 0.4980813393645608, |
|
"kl": 0.004901885986328125, |
|
"learning_rate": 1.3046731195665748e-06, |
|
"loss": 0.0002, |
|
"reward": 0.700000013038516, |
|
"reward_std": 0.17320507764816284, |
|
"rewards/accuracy_reward": 0.700000013038516, |
|
"rewards/format_reward": 0.0, |
|
"step": 275 |
|
}, |
|
{ |
|
"completion_length": 582.2666862487793, |
|
"epoch": 0.5973333333333334, |
|
"grad_norm": 0.15409228399304845, |
|
"kl": 0.004350852966308594, |
|
"learning_rate": 1.2493318550721775e-06, |
|
"loss": 0.0002, |
|
"reward": 0.762500011920929, |
|
"reward_std": 0.12990380823612213, |
|
"rewards/accuracy_reward": 0.762500011920929, |
|
"rewards/format_reward": 0.0, |
|
"step": 280 |
|
}, |
|
{ |
|
"completion_length": 535.845851135254, |
|
"epoch": 0.608, |
|
"grad_norm": 0.27378549806354485, |
|
"kl": 0.006243515014648438, |
|
"learning_rate": 1.1943395093426585e-06, |
|
"loss": 0.0002, |
|
"reward": 0.7708333440124988, |
|
"reward_std": 0.12268693000078201, |
|
"rewards/accuracy_reward": 0.7708333440124988, |
|
"rewards/format_reward": 0.0, |
|
"step": 285 |
|
}, |
|
{ |
|
"completion_length": 578.8583518981934, |
|
"epoch": 0.6186666666666667, |
|
"grad_norm": 0.24016865067136048, |
|
"kl": 0.005126190185546875, |
|
"learning_rate": 1.1397726292458115e-06, |
|
"loss": 0.0002, |
|
"reward": 0.7458333447575569, |
|
"reward_std": 0.14433756470680237, |
|
"rewards/accuracy_reward": 0.7458333447575569, |
|
"rewards/format_reward": 0.0, |
|
"step": 290 |
|
}, |
|
{ |
|
"completion_length": 532.8083442687988, |
|
"epoch": 0.6293333333333333, |
|
"grad_norm": 0.3069882345554909, |
|
"kl": 0.006308746337890625, |
|
"learning_rate": 1.085707169420437e-06, |
|
"loss": 0.0003, |
|
"reward": 0.8000000089406967, |
|
"reward_std": 0.12990380823612213, |
|
"rewards/accuracy_reward": 0.8000000089406967, |
|
"rewards/format_reward": 0.0, |
|
"step": 295 |
|
}, |
|
{ |
|
"completion_length": 546.8958488464356, |
|
"epoch": 0.64, |
|
"grad_norm": 0.2504427887061209, |
|
"kl": 0.00545654296875, |
|
"learning_rate": 1.0322183865509054e-06, |
|
"loss": 0.0002, |
|
"reward": 0.7958333477377891, |
|
"reward_std": 0.1515544429421425, |
|
"rewards/accuracy_reward": 0.7958333477377891, |
|
"rewards/format_reward": 0.0, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"eval_completion_length": 559.127672134691, |
|
"eval_kl": 0.0062117849945262735, |
|
"eval_loss": 0.0002487737510818988, |
|
"eval_reward": 0.6792064965910213, |
|
"eval_reward_std": 0.1668615815983077, |
|
"eval_rewards/accuracy_reward": 0.6792064965910213, |
|
"eval_rewards/format_reward": 0.0, |
|
"eval_runtime": 2185.5304, |
|
"eval_samples_per_second": 2.288, |
|
"eval_steps_per_second": 0.024, |
|
"step": 300 |
|
}, |
|
{ |
|
"completion_length": 552.7166816711426, |
|
"epoch": 0.6506666666666666, |
|
"grad_norm": 0.21206838418010487, |
|
"kl": 0.004500389099121094, |
|
"learning_rate": 9.793807346132464e-07, |
|
"loss": 0.0002, |
|
"reward": 0.7666666798293591, |
|
"reward_std": 0.1515544429421425, |
|
"rewards/accuracy_reward": 0.7666666798293591, |
|
"rewards/format_reward": 0.0, |
|
"step": 305 |
|
}, |
|
{ |
|
"completion_length": 547.9291816711426, |
|
"epoch": 0.6613333333333333, |
|
"grad_norm": 0.33095120969398945, |
|
"kl": 0.006579971313476563, |
|
"learning_rate": 9.272677612385667e-07, |
|
"loss": 0.0003, |
|
"reward": 0.7708333436399698, |
|
"reward_std": 0.1154700517654419, |
|
"rewards/accuracy_reward": 0.7708333436399698, |
|
"rewards/format_reward": 0.0, |
|
"step": 310 |
|
}, |
|
{ |
|
"completion_length": 577.8125190734863, |
|
"epoch": 0.672, |
|
"grad_norm": 0.221412944135324, |
|
"kl": 0.0048095703125, |
|
"learning_rate": 8.759520053380591e-07, |
|
"loss": 0.0002, |
|
"reward": 0.762500012293458, |
|
"reward_std": 0.18042195588350296, |
|
"rewards/accuracy_reward": 0.762500012293458, |
|
"rewards/format_reward": 0.0, |
|
"step": 315 |
|
}, |
|
{ |
|
"completion_length": 613.1125186920166, |
|
"epoch": 0.6826666666666666, |
|
"grad_norm": 0.34456461165713703, |
|
"kl": 0.008467864990234376, |
|
"learning_rate": 8.255048961321088e-07, |
|
"loss": 0.0003, |
|
"reward": 0.7166666805744171, |
|
"reward_std": 0.17320507764816284, |
|
"rewards/accuracy_reward": 0.7166666805744171, |
|
"rewards/format_reward": 0.0, |
|
"step": 320 |
|
}, |
|
{ |
|
"completion_length": 592.1083511352539, |
|
"epoch": 0.6933333333333334, |
|
"grad_norm": 0.1850685152795246, |
|
"kl": 0.00501251220703125, |
|
"learning_rate": 7.759966537240373e-07, |
|
"loss": 0.0002, |
|
"reward": 0.729166679084301, |
|
"reward_std": 0.13712068647146225, |
|
"rewards/accuracy_reward": 0.729166679084301, |
|
"rewards/format_reward": 0.0, |
|
"step": 325 |
|
}, |
|
{ |
|
"completion_length": 573.8583511352539, |
|
"epoch": 0.704, |
|
"grad_norm": 0.11497658855387496, |
|
"kl": 0.007683181762695312, |
|
"learning_rate": 7.274961913568773e-07, |
|
"loss": 0.0003, |
|
"reward": 0.783333345502615, |
|
"reward_std": 0.17320507764816284, |
|
"rewards/accuracy_reward": 0.783333345502615, |
|
"rewards/format_reward": 0.0, |
|
"step": 330 |
|
}, |
|
{ |
|
"completion_length": 566.783351135254, |
|
"epoch": 0.7146666666666667, |
|
"grad_norm": 0.26449552613376137, |
|
"kl": 0.006775283813476562, |
|
"learning_rate": 6.800710194892484e-07, |
|
"loss": 0.0003, |
|
"reward": 0.8333333473652601, |
|
"reward_std": 0.13712068647146225, |
|
"rewards/accuracy_reward": 0.8333333473652601, |
|
"rewards/format_reward": 0.0, |
|
"step": 335 |
|
}, |
|
{ |
|
"completion_length": 602.9125190734864, |
|
"epoch": 0.7253333333333334, |
|
"grad_norm": 0.14475377690275906, |
|
"kl": 0.004502487182617187, |
|
"learning_rate": 6.33787151823836e-07, |
|
"loss": 0.0002, |
|
"reward": 0.783333346247673, |
|
"reward_std": 0.14433756470680237, |
|
"rewards/accuracy_reward": 0.783333346247673, |
|
"rewards/format_reward": 0.0, |
|
"step": 340 |
|
}, |
|
{ |
|
"completion_length": 614.1708518981934, |
|
"epoch": 0.736, |
|
"grad_norm": 0.4505266593494607, |
|
"kl": 0.006034088134765625, |
|
"learning_rate": 5.887090134192947e-07, |
|
"loss": 0.0002, |
|
"reward": 0.7000000108033418, |
|
"reward_std": 0.1587713211774826, |
|
"rewards/accuracy_reward": 0.7000000108033418, |
|
"rewards/format_reward": 0.0, |
|
"step": 345 |
|
}, |
|
{ |
|
"completion_length": 563.0041870117187, |
|
"epoch": 0.7466666666666667, |
|
"grad_norm": 0.2501643433234143, |
|
"kl": 0.00689239501953125, |
|
"learning_rate": 5.448993510134669e-07, |
|
"loss": 0.0003, |
|
"reward": 0.7625000067055225, |
|
"reward_std": 0.12990380823612213, |
|
"rewards/accuracy_reward": 0.7625000067055225, |
|
"rewards/format_reward": 0.0, |
|
"step": 350 |
|
}, |
|
{ |
|
"completion_length": 575.108349609375, |
|
"epoch": 0.7573333333333333, |
|
"grad_norm": 0.21546823633969947, |
|
"kl": 0.005526924133300781, |
|
"learning_rate": 5.024191456827498e-07, |
|
"loss": 0.0002, |
|
"reward": 0.7958333477377891, |
|
"reward_std": 0.14433756470680237, |
|
"rewards/accuracy_reward": 0.7958333477377891, |
|
"rewards/format_reward": 0.0, |
|
"step": 355 |
|
}, |
|
{ |
|
"completion_length": 544.3750190734863, |
|
"epoch": 0.768, |
|
"grad_norm": 0.26110346753809516, |
|
"kl": 0.0052356719970703125, |
|
"learning_rate": 4.6132752795918667e-07, |
|
"loss": 0.0002, |
|
"reward": 0.7791666734963656, |
|
"reward_std": 0.09381941705942154, |
|
"rewards/accuracy_reward": 0.7791666734963656, |
|
"rewards/format_reward": 0.0, |
|
"step": 360 |
|
}, |
|
{ |
|
"completion_length": 560.4875164031982, |
|
"epoch": 0.7786666666666666, |
|
"grad_norm": 0.1476694523224307, |
|
"kl": 0.005467987060546875, |
|
"learning_rate": 4.2168169552342905e-07, |
|
"loss": 0.0002, |
|
"reward": 0.8000000111758709, |
|
"reward_std": 0.09381941705942154, |
|
"rewards/accuracy_reward": 0.8000000111758709, |
|
"rewards/format_reward": 0.0, |
|
"step": 365 |
|
}, |
|
{ |
|
"completion_length": 599.345849609375, |
|
"epoch": 0.7893333333333333, |
|
"grad_norm": 0.18228043856028503, |
|
"kl": 0.005198287963867188, |
|
"learning_rate": 3.8353683358814046e-07, |
|
"loss": 0.0002, |
|
"reward": 0.7375000108033418, |
|
"reward_std": 0.1587713211774826, |
|
"rewards/accuracy_reward": 0.7375000108033418, |
|
"rewards/format_reward": 0.0, |
|
"step": 370 |
|
}, |
|
{ |
|
"completion_length": 593.5125167846679, |
|
"epoch": 0.8, |
|
"grad_norm": 0.10568026930273518, |
|
"kl": 0.005176544189453125, |
|
"learning_rate": 3.469460380826697e-07, |
|
"loss": 0.0002, |
|
"reward": 0.7375000096857548, |
|
"reward_std": 0.12268693000078201, |
|
"rewards/accuracy_reward": 0.7375000096857548, |
|
"rewards/format_reward": 0.0, |
|
"step": 375 |
|
}, |
|
{ |
|
"completion_length": 561.0041847229004, |
|
"epoch": 0.8106666666666666, |
|
"grad_norm": 0.41189447336073276, |
|
"kl": 0.00511627197265625, |
|
"learning_rate": 3.119602417459075e-07, |
|
"loss": 0.0002, |
|
"reward": 0.7666666768491268, |
|
"reward_std": 0.1587713211774826, |
|
"rewards/accuracy_reward": 0.7666666768491268, |
|
"rewards/format_reward": 0.0, |
|
"step": 380 |
|
}, |
|
{ |
|
"completion_length": 555.7875175476074, |
|
"epoch": 0.8213333333333334, |
|
"grad_norm": 0.12259595646219885, |
|
"kl": 0.0051021575927734375, |
|
"learning_rate": 2.786281432302071e-07, |
|
"loss": 0.0002, |
|
"reward": 0.8333333414047956, |
|
"reward_std": 0.10103629529476166, |
|
"rewards/accuracy_reward": 0.8333333414047956, |
|
"rewards/format_reward": 0.0, |
|
"step": 385 |
|
}, |
|
{ |
|
"completion_length": 566.9166854858398, |
|
"epoch": 0.832, |
|
"grad_norm": 0.12352458500062828, |
|
"kl": 0.00482635498046875, |
|
"learning_rate": 2.46996139315057e-07, |
|
"loss": 0.0002, |
|
"reward": 0.7458333432674408, |
|
"reward_std": 0.1154700517654419, |
|
"rewards/accuracy_reward": 0.7458333432674408, |
|
"rewards/format_reward": 0.0, |
|
"step": 390 |
|
}, |
|
{ |
|
"completion_length": 599.9041847229004, |
|
"epoch": 0.8426666666666667, |
|
"grad_norm": 0.3200791222129482, |
|
"kl": 0.0046905517578125, |
|
"learning_rate": 2.1710826032485286e-07, |
|
"loss": 0.0002, |
|
"reward": 0.7291666761040687, |
|
"reward_std": 0.1515544429421425, |
|
"rewards/accuracy_reward": 0.7291666761040687, |
|
"rewards/format_reward": 0.0, |
|
"step": 395 |
|
}, |
|
{ |
|
"completion_length": 595.4541847229004, |
|
"epoch": 0.8533333333333334, |
|
"grad_norm": 0.34590005976105703, |
|
"kl": 0.00473480224609375, |
|
"learning_rate": 1.8900610884066817e-07, |
|
"loss": 0.0002, |
|
"reward": 0.7791666734963656, |
|
"reward_std": 0.09381941705942154, |
|
"rewards/accuracy_reward": 0.7791666734963656, |
|
"rewards/format_reward": 0.0, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.8533333333333334, |
|
"eval_completion_length": 557.2634201535753, |
|
"eval_kl": 0.007186598079219745, |
|
"eval_loss": 0.0002872430195566267, |
|
"eval_reward": 0.6829219916064269, |
|
"eval_reward_std": 0.1668615815983077, |
|
"eval_rewards/accuracy_reward": 0.6829219916064269, |
|
"eval_rewards/format_reward": 0.0, |
|
"eval_runtime": 2183.8414, |
|
"eval_samples_per_second": 2.29, |
|
"eval_steps_per_second": 0.024, |
|
"step": 400 |
|
}, |
|
{ |
|
"completion_length": 573.129183959961, |
|
"epoch": 0.864, |
|
"grad_norm": 0.6768316752668614, |
|
"kl": 0.00518951416015625, |
|
"learning_rate": 1.627288017913383e-07, |
|
"loss": 0.0002, |
|
"reward": 0.7958333414047957, |
|
"reward_std": 0.10103629529476166, |
|
"rewards/accuracy_reward": 0.7958333414047957, |
|
"rewards/format_reward": 0.0, |
|
"step": 405 |
|
}, |
|
{ |
|
"completion_length": 549.708349609375, |
|
"epoch": 0.8746666666666667, |
|
"grad_norm": 0.3709687454137675, |
|
"kl": 0.005501937866210937, |
|
"learning_rate": 1.3831291600445573e-07, |
|
"loss": 0.0002, |
|
"reward": 0.7625000089406967, |
|
"reward_std": 0.1587713211774826, |
|
"rewards/accuracy_reward": 0.7625000089406967, |
|
"rewards/format_reward": 0.0, |
|
"step": 410 |
|
}, |
|
{ |
|
"completion_length": 588.0458534240722, |
|
"epoch": 0.8853333333333333, |
|
"grad_norm": 0.3077767696291164, |
|
"kl": 0.005025482177734375, |
|
"learning_rate": 1.1579243729307487e-07, |
|
"loss": 0.0002, |
|
"reward": 0.725000013038516, |
|
"reward_std": 0.1587713211774826, |
|
"rewards/accuracy_reward": 0.725000013038516, |
|
"rewards/format_reward": 0.0, |
|
"step": 415 |
|
}, |
|
{ |
|
"completion_length": 538.737515258789, |
|
"epoch": 0.896, |
|
"grad_norm": 0.23822267847271647, |
|
"kl": 0.00621185302734375, |
|
"learning_rate": 9.519871314899092e-08, |
|
"loss": 0.0002, |
|
"reward": 0.7791666775941849, |
|
"reward_std": 0.1515544429421425, |
|
"rewards/accuracy_reward": 0.7791666775941849, |
|
"rewards/format_reward": 0.0, |
|
"step": 420 |
|
}, |
|
{ |
|
"completion_length": 592.0875175476074, |
|
"epoch": 0.9066666666666666, |
|
"grad_norm": 0.5349062347465572, |
|
"kl": 0.005745697021484375, |
|
"learning_rate": 7.656040910844358e-08, |
|
"loss": 0.0002, |
|
"reward": 0.7208333447575569, |
|
"reward_std": 0.14433756470680237, |
|
"rewards/accuracy_reward": 0.7208333447575569, |
|
"rewards/format_reward": 0.0, |
|
"step": 425 |
|
}, |
|
{ |
|
"completion_length": 538.1250160217285, |
|
"epoch": 0.9173333333333333, |
|
"grad_norm": 0.29771658268540435, |
|
"kl": 0.00823822021484375, |
|
"learning_rate": 5.990346885098235e-08, |
|
"loss": 0.0003, |
|
"reward": 0.8416666742414236, |
|
"reward_std": 0.12990380823612213, |
|
"rewards/accuracy_reward": 0.8416666742414236, |
|
"rewards/format_reward": 0.0, |
|
"step": 430 |
|
}, |
|
{ |
|
"completion_length": 573.7250144958496, |
|
"epoch": 0.928, |
|
"grad_norm": 0.3246981835072228, |
|
"kl": 0.006461715698242188, |
|
"learning_rate": 4.5251078087033493e-08, |
|
"loss": 0.0003, |
|
"reward": 0.7958333436399698, |
|
"reward_std": 0.1515544429421425, |
|
"rewards/accuracy_reward": 0.7958333436399698, |
|
"rewards/format_reward": 0.0, |
|
"step": 435 |
|
}, |
|
{ |
|
"completion_length": 578.0708473205566, |
|
"epoch": 0.9386666666666666, |
|
"grad_norm": 0.1985155350639712, |
|
"kl": 0.004383087158203125, |
|
"learning_rate": 3.262363228443427e-08, |
|
"loss": 0.0002, |
|
"reward": 0.8041666775941849, |
|
"reward_std": 0.13712068647146225, |
|
"rewards/accuracy_reward": 0.8041666775941849, |
|
"rewards/format_reward": 0.0, |
|
"step": 440 |
|
}, |
|
{ |
|
"completion_length": 550.4708488464355, |
|
"epoch": 0.9493333333333334, |
|
"grad_norm": 0.32873403114186156, |
|
"kl": 0.006027603149414062, |
|
"learning_rate": 2.2038708278862952e-08, |
|
"loss": 0.0002, |
|
"reward": 0.7458333421498538, |
|
"reward_std": 0.1154700517654419, |
|
"rewards/accuracy_reward": 0.7458333421498538, |
|
"rewards/format_reward": 0.0, |
|
"step": 445 |
|
}, |
|
{ |
|
"completion_length": 536.8958450317383, |
|
"epoch": 0.96, |
|
"grad_norm": 0.26399382773089725, |
|
"kl": 0.0049957275390625, |
|
"learning_rate": 1.3511039807673209e-08, |
|
"loss": 0.0002, |
|
"reward": 0.8000000089406967, |
|
"reward_std": 0.12268693000078201, |
|
"rewards/accuracy_reward": 0.8000000089406967, |
|
"rewards/format_reward": 0.0, |
|
"step": 450 |
|
}, |
|
{ |
|
"completion_length": 554.4416831970215, |
|
"epoch": 0.9706666666666667, |
|
"grad_norm": 0.5214732249653611, |
|
"kl": 0.009600067138671875, |
|
"learning_rate": 7.0524970011963675e-09, |
|
"loss": 0.0004, |
|
"reward": 0.7791666768491268, |
|
"reward_std": 0.10103629529476166, |
|
"rewards/accuracy_reward": 0.7791666768491268, |
|
"rewards/format_reward": 0.0, |
|
"step": 455 |
|
}, |
|
{ |
|
"completion_length": 574.6125175476075, |
|
"epoch": 0.9813333333333333, |
|
"grad_norm": 0.19467716795692694, |
|
"kl": 0.005005645751953125, |
|
"learning_rate": 2.6720698600553595e-09, |
|
"loss": 0.0002, |
|
"reward": 0.8083333469927311, |
|
"reward_std": 0.12990380823612213, |
|
"rewards/accuracy_reward": 0.8083333469927311, |
|
"rewards/format_reward": 0.0, |
|
"step": 460 |
|
}, |
|
{ |
|
"completion_length": 576.0375144958496, |
|
"epoch": 0.992, |
|
"grad_norm": 0.08977231270612557, |
|
"kl": 0.004978179931640625, |
|
"learning_rate": 3.7585574148779613e-10, |
|
"loss": 0.0002, |
|
"reward": 0.7625000100582838, |
|
"reward_std": 0.12990380823612213, |
|
"rewards/accuracy_reward": 0.7625000100582838, |
|
"rewards/format_reward": 0.0, |
|
"step": 465 |
|
}, |
|
{ |
|
"completion_length": 581.3750178019205, |
|
"epoch": 0.9984, |
|
"kl": 0.0047486623128255205, |
|
"reward": 0.7361111218730608, |
|
"reward_std": 0.16839382549126944, |
|
"rewards/accuracy_reward": 0.7361111218730608, |
|
"rewards/format_reward": 0.0, |
|
"step": 468, |
|
"total_flos": 0.0, |
|
"train_loss": 1.2402063848306162, |
|
"train_runtime": 35030.8408, |
|
"train_samples_per_second": 0.214, |
|
"train_steps_per_second": 0.013 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 468, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |