Qwen-2.5-7B-Simple-RL / trainer_state.json
Maker-0409's picture
Model save
b1930f0 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9984,
"eval_steps": 100,
"global_step": 468,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 633.2446681976319,
"epoch": 0.010666666666666666,
"grad_norm": 2.2443933486938477,
"kl": 0.00011417865753173828,
"learning_rate": 3.1914893617021275e-07,
"loss": 0.0,
"reward": 1.138736367225647,
"reward_std": 0.8278621450066567,
"rewards/accuracy_reward": 0.5946428831666708,
"rewards/cosine_scaled_reward": 0.2899268216686323,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.25416668243706225,
"step": 5
},
{
"completion_length": 600.8857383728027,
"epoch": 0.021333333333333333,
"grad_norm": 5.001251220703125,
"kl": 0.00020779371261596679,
"learning_rate": 6.382978723404255e-07,
"loss": 0.0,
"reward": 1.2528822764754295,
"reward_std": 0.8592379853129387,
"rewards/accuracy_reward": 0.6553571775555611,
"rewards/cosine_scaled_reward": 0.34097747248015364,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.25654763616621495,
"step": 10
},
{
"completion_length": 601.8518112182617,
"epoch": 0.032,
"grad_norm": 3.453845500946045,
"kl": 0.00034580230712890627,
"learning_rate": 9.574468085106384e-07,
"loss": 0.0,
"reward": 1.2825960636138916,
"reward_std": 0.7762525148689747,
"rewards/accuracy_reward": 0.6642857484519482,
"rewards/cosine_scaled_reward": 0.3486674582702108,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.26964287189766767,
"step": 15
},
{
"completion_length": 620.7839553833007,
"epoch": 0.042666666666666665,
"grad_norm": 63.01131057739258,
"kl": 0.001246500015258789,
"learning_rate": 1.276595744680851e-06,
"loss": 0.0001,
"reward": 1.2914750523865224,
"reward_std": 0.7945833645761013,
"rewards/accuracy_reward": 0.6571428865194321,
"rewards/cosine_scaled_reward": 0.3593321413063677,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.2750000203028321,
"step": 20
},
{
"completion_length": 639.3946762084961,
"epoch": 0.05333333333333334,
"grad_norm": 1.1951252222061157,
"kl": 0.001938199996948242,
"learning_rate": 1.5957446808510639e-06,
"loss": 0.0001,
"reward": 1.2197763450443744,
"reward_std": 0.7964548453688621,
"rewards/accuracy_reward": 0.6285714630037547,
"rewards/cosine_scaled_reward": 0.323942980915308,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.2672619212418795,
"step": 25
},
{
"completion_length": 645.9482414245606,
"epoch": 0.064,
"grad_norm": 0.5322187542915344,
"kl": 0.0028698921203613283,
"learning_rate": 1.9148936170212767e-06,
"loss": 0.0001,
"reward": 1.34233574308455,
"reward_std": 0.7051636058837175,
"rewards/accuracy_reward": 0.6821428902447224,
"rewards/cosine_scaled_reward": 0.38400235488079487,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.2761904950253665,
"step": 30
},
{
"completion_length": 630.1071678161621,
"epoch": 0.07466666666666667,
"grad_norm": 0.686019241809845,
"kl": 0.00424489974975586,
"learning_rate": 2.2340425531914894e-06,
"loss": 0.0002,
"reward": 1.2706220560474322,
"reward_std": 0.7081292014569044,
"rewards/accuracy_reward": 0.6839286010712385,
"rewards/cosine_scaled_reward": 0.34145535016432405,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.2452381114475429,
"step": 35
},
{
"completion_length": 663.8464553833007,
"epoch": 0.08533333333333333,
"grad_norm": 10619385856.0,
"kl": 11324620.806011772,
"learning_rate": 2.553191489361702e-06,
"loss": 453134.65,
"reward": 1.4818414891138674,
"reward_std": 0.724718413501978,
"rewards/accuracy_reward": 0.7196428954601288,
"rewards/cosine_scaled_reward": 0.43124618427827954,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.3309524044394493,
"step": 40
},
{
"completion_length": 636.5178840637207,
"epoch": 0.096,
"grad_norm": 0.4083445370197296,
"kl": 0.1388763427734375,
"learning_rate": 2.872340425531915e-06,
"loss": 0.0055,
"reward": 1.5206772923469543,
"reward_std": 0.6890950493514538,
"rewards/accuracy_reward": 0.7428571715950966,
"rewards/cosine_scaled_reward": 0.4444867596961558,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.3333333550952375,
"step": 45
},
{
"completion_length": 624.0178833007812,
"epoch": 0.10666666666666667,
"grad_norm": 0.6491600275039673,
"kl": 0.014713478088378907,
"learning_rate": 2.9996241442585123e-06,
"loss": 0.0006,
"reward": 1.5073627218604089,
"reward_std": 0.7132997542619706,
"rewards/accuracy_reward": 0.712500025331974,
"rewards/cosine_scaled_reward": 0.41093407664448023,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.38392860516905786,
"step": 50
},
{
"completion_length": 631.5339569091797,
"epoch": 0.11733333333333333,
"grad_norm": 0.7147920727729797,
"kl": 0.007195663452148437,
"learning_rate": 2.9973279301399446e-06,
"loss": 0.0003,
"reward": 1.5377919152379036,
"reward_std": 0.76092077344656,
"rewards/accuracy_reward": 0.7232143200933934,
"rewards/cosine_scaled_reward": 0.4282680474221706,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.386309552192688,
"step": 55
},
{
"completion_length": 627.9214561462402,
"epoch": 0.128,
"grad_norm": 0.8942143321037292,
"kl": 0.008642578125,
"learning_rate": 2.992947502998804e-06,
"loss": 0.0003,
"reward": 1.6543699458241463,
"reward_std": 0.7264986954629421,
"rewards/accuracy_reward": 0.7214285999536514,
"rewards/cosine_scaled_reward": 0.40972703909501434,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.5232143249362707,
"step": 60
},
{
"completion_length": 633.0232421875,
"epoch": 0.13866666666666666,
"grad_norm": 6.921348571777344,
"kl": 0.01439208984375,
"learning_rate": 2.9864889601923268e-06,
"loss": 0.0006,
"reward": 1.7206872910261155,
"reward_std": 0.7344334974884987,
"rewards/accuracy_reward": 0.725000036507845,
"rewards/cosine_scaled_reward": 0.43497296012938025,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.5607143200933933,
"step": 65
},
{
"completion_length": 656.7178894042969,
"epoch": 0.14933333333333335,
"grad_norm": 0.6442045569419861,
"kl": 0.01673736572265625,
"learning_rate": 2.977961291721137e-06,
"loss": 0.0007,
"reward": 1.8801582887768746,
"reward_std": 0.7263622097671032,
"rewards/accuracy_reward": 0.7571428894996644,
"rewards/cosine_scaled_reward": 0.47301534870639445,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.6500000521540642,
"step": 70
},
{
"completion_length": 619.4536033630371,
"epoch": 0.16,
"grad_norm": 1.7239394187927246,
"kl": 0.026496124267578126,
"learning_rate": 2.9673763677155655e-06,
"loss": 0.0011,
"reward": 1.8051109313964844,
"reward_std": 0.7346500240266323,
"rewards/accuracy_reward": 0.7160714596509934,
"rewards/cosine_scaled_reward": 0.39439656864851713,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.6946429140865803,
"step": 75
},
{
"completion_length": 623.1785926818848,
"epoch": 0.17066666666666666,
"grad_norm": 0.6716666221618652,
"kl": 0.018997955322265624,
"learning_rate": 2.9547489219129666e-06,
"loss": 0.0008,
"reward": 1.9212585434317588,
"reward_std": 0.634969700500369,
"rewards/accuracy_reward": 0.7785714574158191,
"rewards/cosine_scaled_reward": 0.4653060721466318,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.6773809991776943,
"step": 80
},
{
"completion_length": 690.1518196105957,
"epoch": 0.18133333333333335,
"grad_norm": 1.1456305980682373,
"kl": 0.02204437255859375,
"learning_rate": 2.9400965311490175e-06,
"loss": 0.0009,
"reward": 1.9084690719842912,
"reward_std": 0.7263222638517618,
"rewards/accuracy_reward": 0.7303571783006191,
"rewards/cosine_scaled_reward": 0.4507309086387977,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.7273810178041458,
"step": 85
},
{
"completion_length": 650.4768188476562,
"epoch": 0.192,
"grad_norm": 29.814361572265625,
"kl": 0.078216552734375,
"learning_rate": 2.9234395908915565e-06,
"loss": 0.0031,
"reward": 1.8972563683986663,
"reward_std": 0.7165740359574556,
"rewards/accuracy_reward": 0.6875000324100256,
"rewards/cosine_scaled_reward": 0.4055896209087223,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.8041667267680168,
"step": 90
},
{
"completion_length": 668.3339584350585,
"epoch": 0.20266666666666666,
"grad_norm": 0.48750847578048706,
"kl": 0.02767181396484375,
"learning_rate": 2.904801286851009e-06,
"loss": 0.0011,
"reward": 1.9524270623922348,
"reward_std": 0.6363851364701987,
"rewards/accuracy_reward": 0.7035714564844966,
"rewards/cosine_scaled_reward": 0.42206980669870975,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.826785783469677,
"step": 95
},
{
"completion_length": 645.9428840637207,
"epoch": 0.21333333333333335,
"grad_norm": 0.8315287232398987,
"kl": 0.02986602783203125,
"learning_rate": 2.884207562706925e-06,
"loss": 0.0012,
"reward": 2.0384097367525102,
"reward_std": 0.6786769151687622,
"rewards/accuracy_reward": 0.7517857387661934,
"rewards/cosine_scaled_reward": 0.4657905898289755,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.820833396166563,
"step": 100
},
{
"epoch": 0.21333333333333335,
"eval_completion_length": 688.0076597412109,
"eval_kl": 0.0332870361328125,
"eval_loss": 0.0013802805915474892,
"eval_reward": 1.86520801551342,
"eval_reward_std": 0.7114028903335333,
"eval_rewards/accuracy_reward": 0.650542886838317,
"eval_rewards/cosine_scaled_reward": 0.3737031816519331,
"eval_rewards/format_reward": 0.0,
"eval_rewards/reasoning_steps_reward": 0.8409619681358338,
"eval_runtime": 32350.4437,
"eval_samples_per_second": 0.155,
"eval_steps_per_second": 0.011,
"step": 100
},
{
"completion_length": 717.150033569336,
"epoch": 0.224,
"grad_norm": 1.5486549139022827,
"kl": 0.03196563720703125,
"learning_rate": 2.8616870839955444e-06,
"loss": 0.0013,
"reward": 2.0346583992242815,
"reward_std": 0.7014419212937355,
"rewards/accuracy_reward": 0.7232143215835094,
"rewards/cosine_scaled_reward": 0.457277343980968,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.8541667237877846,
"step": 105
},
{
"completion_length": 708.8571708679199,
"epoch": 0.23466666666666666,
"grad_norm": 0.5981384515762329,
"kl": 0.02979583740234375,
"learning_rate": 2.837271198208662e-06,
"loss": 0.0012,
"reward": 2.0179374665021896,
"reward_std": 0.6652137346565723,
"rewards/accuracy_reward": 0.7250000320374965,
"rewards/cosine_scaled_reward": 0.47091358043253423,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.8220238700509072,
"step": 110
},
{
"completion_length": 632.7732406616211,
"epoch": 0.24533333333333332,
"grad_norm": 0.7111315131187439,
"kl": 0.02539825439453125,
"learning_rate": 2.8109938911593322e-06,
"loss": 0.001,
"reward": 2.0148118153214454,
"reward_std": 0.6429756574332715,
"rewards/accuracy_reward": 0.728571455553174,
"rewards/cosine_scaled_reward": 0.44754982106387614,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.8386905357241631,
"step": 115
},
{
"completion_length": 655.8321723937988,
"epoch": 0.256,
"grad_norm": 0.5316483974456787,
"kl": 0.02179107666015625,
"learning_rate": 2.7828917396751474e-06,
"loss": 0.0009,
"reward": 1.9900789648294448,
"reward_std": 0.6477071691304446,
"rewards/accuracy_reward": 0.7160714656114578,
"rewards/cosine_scaled_reward": 0.43412648113444446,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.8398810118436814,
"step": 120
},
{
"completion_length": 644.7321693420411,
"epoch": 0.26666666666666666,
"grad_norm": 0.4458823800086975,
"kl": 0.025299072265625,
"learning_rate": 2.753003860684943e-06,
"loss": 0.001,
"reward": 2.1427780210971834,
"reward_std": 0.6711063630878925,
"rewards/accuracy_reward": 0.7750000268220901,
"rewards/cosine_scaled_reward": 0.5183731818571686,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.8494048312306404,
"step": 125
},
{
"completion_length": 684.2911033630371,
"epoch": 0.2773333333333333,
"grad_norm": 0.7146270871162415,
"kl": 0.034222412109375,
"learning_rate": 2.721371856769793e-06,
"loss": 0.0014,
"reward": 1.9814838409423827,
"reward_std": 0.7353869907557964,
"rewards/accuracy_reward": 0.6625000331550837,
"rewards/cosine_scaled_reward": 0.3981504186260281,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.9208333924412727,
"step": 130
},
{
"completion_length": 650.483960723877,
"epoch": 0.288,
"grad_norm": 0.8331003189086914,
"kl": 0.046978759765625,
"learning_rate": 2.688039758254093e-06,
"loss": 0.0019,
"reward": 2.223627084493637,
"reward_std": 0.6465678755193949,
"rewards/accuracy_reward": 0.7732143219560385,
"rewards/cosine_scaled_reward": 0.506960358901415,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.94345243871212,
"step": 135
},
{
"completion_length": 702.9536026000976,
"epoch": 0.2986666666666667,
"grad_norm": 1.9107334613800049,
"kl": 0.0536590576171875,
"learning_rate": 2.65305396191733e-06,
"loss": 0.0021,
"reward": 2.1239778250455856,
"reward_std": 0.6765143848955631,
"rewards/accuracy_reward": 0.7071428891271353,
"rewards/cosine_scaled_reward": 0.4555253505706787,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.9613095715641975,
"step": 140
},
{
"completion_length": 733.6089630126953,
"epoch": 0.30933333333333335,
"grad_norm": 0.5300867557525635,
"kl": 0.05316162109375,
"learning_rate": 2.61646316641186e-06,
"loss": 0.0021,
"reward": 2.1554796636104583,
"reward_std": 0.6578622825443745,
"rewards/accuracy_reward": 0.7303571704775095,
"rewards/cosine_scaled_reward": 0.47036054339259864,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.9547619551420212,
"step": 145
},
{
"completion_length": 713.221459197998,
"epoch": 0.32,
"grad_norm": 0.6026062369346619,
"kl": 0.0533843994140625,
"learning_rate": 2.5783183044765715e-06,
"loss": 0.0021,
"reward": 2.1126459658145906,
"reward_std": 0.5920085646212101,
"rewards/accuracy_reward": 0.7089285995811224,
"rewards/cosine_scaled_reward": 0.4566935421898961,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.9470238655805587,
"step": 150
},
{
"completion_length": 678.6428886413574,
"epoch": 0.33066666666666666,
"grad_norm": 0.6598377227783203,
"kl": 0.049908447265625,
"learning_rate": 2.5386724720408135e-06,
"loss": 0.002,
"reward": 2.243595580756664,
"reward_std": 0.6088640403002501,
"rewards/accuracy_reward": 0.7767857441678643,
"rewards/cosine_scaled_reward": 0.5435954930260778,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.9232143476605416,
"step": 155
},
{
"completion_length": 683.9268142700196,
"epoch": 0.3413333333333333,
"grad_norm": 0.6654959321022034,
"kl": 0.0447540283203125,
"learning_rate": 2.49758085431725e-06,
"loss": 0.0018,
"reward": 2.0952899247407912,
"reward_std": 0.6968366518616677,
"rewards/accuracy_reward": 0.7232143208384514,
"rewards/cosine_scaled_reward": 0.4637422326952219,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.9083333939313889,
"step": 160
},
{
"completion_length": 691.3464614868165,
"epoch": 0.352,
"grad_norm": 0.689552903175354,
"kl": 0.0448211669921875,
"learning_rate": 2.455100648986533e-06,
"loss": 0.0018,
"reward": 2.0519487097859384,
"reward_std": 0.7221721112728119,
"rewards/accuracy_reward": 0.6964286031201482,
"rewards/cosine_scaled_reward": 0.4602819522842765,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.8952381581068038,
"step": 165
},
{
"completion_length": 696.5268180847168,
"epoch": 0.3626666666666667,
"grad_norm": 1.0024878978729248,
"kl": 0.065167236328125,
"learning_rate": 2.4112909865807053e-06,
"loss": 0.0026,
"reward": 1.7887505039572715,
"reward_std": 0.7482936225831509,
"rewards/accuracy_reward": 0.571428600884974,
"rewards/cosine_scaled_reward": 0.3333932981360704,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.8839286401867866,
"step": 170
},
{
"completion_length": 703.2714614868164,
"epoch": 0.37333333333333335,
"grad_norm": 0.5711168050765991,
"kl": 0.093731689453125,
"learning_rate": 2.366212848176164e-06,
"loss": 0.0037,
"reward": 1.9069189459085465,
"reward_std": 0.8069212771952152,
"rewards/accuracy_reward": 0.6500000327825546,
"rewards/cosine_scaled_reward": 0.42358550764620306,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.8333333879709244,
"step": 175
},
{
"completion_length": 714.2536003112793,
"epoch": 0.384,
"grad_norm": 3.1069464683532715,
"kl": 0.1747802734375,
"learning_rate": 2.319928980510752e-06,
"loss": 0.007,
"reward": 1.6917703241109847,
"reward_std": 0.8836216881871224,
"rewards/accuracy_reward": 0.6089285977184773,
"rewards/cosine_scaled_reward": 0.35307975246978457,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.7297619581222534,
"step": 180
},
{
"completion_length": 727.7018188476562,
"epoch": 0.39466666666666667,
"grad_norm": 1.1932159662246704,
"kl": 0.193988037109375,
"learning_rate": 2.272503808643123e-06,
"loss": 0.0078,
"reward": 1.7027929693460464,
"reward_std": 0.7921728197485208,
"rewards/accuracy_reward": 0.6267857421189547,
"rewards/cosine_scaled_reward": 0.3605310095474124,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.7154762461781502,
"step": 185
},
{
"completion_length": 677.6518127441407,
"epoch": 0.4053333333333333,
"grad_norm": 0.6525413393974304,
"kl": 0.1227813720703125,
"learning_rate": 2.2240033462759628e-06,
"loss": 0.0049,
"reward": 2.055608908832073,
"reward_std": 0.6409808352589608,
"rewards/accuracy_reward": 0.7428571667522192,
"rewards/cosine_scaled_reward": 0.4907278836122714,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.8220238700509072,
"step": 190
},
{
"completion_length": 729.3125358581543,
"epoch": 0.416,
"grad_norm": 0.470821738243103,
"kl": 0.1053009033203125,
"learning_rate": 2.1744951038678905e-06,
"loss": 0.0042,
"reward": 2.1352262631058694,
"reward_std": 0.6541992913931608,
"rewards/accuracy_reward": 0.7446428880095481,
"rewards/cosine_scaled_reward": 0.5340357202105224,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.8565476804971695,
"step": 195
},
{
"completion_length": 736.6607482910156,
"epoch": 0.4266666666666667,
"grad_norm": 0.3663829267024994,
"kl": 0.145220947265625,
"learning_rate": 2.124047994661941e-06,
"loss": 0.0058,
"reward": 2.0683016672730448,
"reward_std": 0.6785697277635336,
"rewards/accuracy_reward": 0.7107143150642514,
"rewards/cosine_scaled_reward": 0.4861587251536548,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.8714286342263222,
"step": 200
},
{
"epoch": 0.4266666666666667,
"eval_completion_length": 743.3604330322265,
"eval_kl": 0.1699279296875,
"eval_loss": 0.006734147202223539,
"eval_reward": 1.8947704853653908,
"eval_reward_std": 0.7092250557422638,
"eval_rewards/accuracy_reward": 0.6307143133163452,
"eval_rewards/cosine_scaled_reward": 0.39257041423644407,
"eval_rewards/format_reward": 0.0,
"eval_rewards/reasoning_steps_reward": 0.871485775399208,
"eval_runtime": 32670.592,
"eval_samples_per_second": 0.153,
"eval_steps_per_second": 0.011,
"step": 200
},
{
"completion_length": 752.7053955078125,
"epoch": 0.43733333333333335,
"grad_norm": 0.5299625396728516,
"kl": 0.1930633544921875,
"learning_rate": 2.072732238761434e-06,
"loss": 0.0077,
"reward": 1.8860187515616418,
"reward_std": 0.7606242794543505,
"rewards/accuracy_reward": 0.6446428863331676,
"rewards/cosine_scaled_reward": 0.40447108587541153,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.8369048193097115,
"step": 205
},
{
"completion_length": 733.603606414795,
"epoch": 0.448,
"grad_norm": 1.6152819395065308,
"kl": 0.219268798828125,
"learning_rate": 2.0206192653867536e-06,
"loss": 0.0088,
"reward": 1.997245892137289,
"reward_std": 0.7402419943362475,
"rewards/accuracy_reward": 0.7017857382073999,
"rewards/cosine_scaled_reward": 0.47284105569124224,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.8226191058754921,
"step": 210
},
{
"completion_length": 844.0661102294922,
"epoch": 0.45866666666666667,
"grad_norm": 7.516280651092529,
"kl": 0.27982177734375,
"learning_rate": 1.967781613449095e-06,
"loss": 0.0112,
"reward": 1.5464881896972655,
"reward_std": 0.8091491930186748,
"rewards/accuracy_reward": 0.49107144959270954,
"rewards/cosine_scaled_reward": 0.21672622584737838,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.8386905357241631,
"step": 215
},
{
"completion_length": 814.1696807861329,
"epoch": 0.4693333333333333,
"grad_norm": 0.4684678018093109,
"kl": 0.194140625,
"learning_rate": 1.9142928305795637e-06,
"loss": 0.0078,
"reward": 1.8477135568857193,
"reward_std": 0.7414120733737946,
"rewards/accuracy_reward": 0.6178571652621031,
"rewards/cosine_scaled_reward": 0.3584277655696496,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.8714286401867867,
"step": 220
},
{
"completion_length": 754.1857452392578,
"epoch": 0.48,
"grad_norm": 0.4328997731208801,
"kl": 0.12838134765625,
"learning_rate": 1.8602273707541886e-06,
"loss": 0.0051,
"reward": 2.1135876968503,
"reward_std": 0.6965163860470056,
"rewards/accuracy_reward": 0.742857176810503,
"rewards/cosine_scaled_reward": 0.5159685641527176,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.8547619715332985,
"step": 225
},
{
"completion_length": 742.7750381469726,
"epoch": 0.49066666666666664,
"grad_norm": 0.4649052619934082,
"kl": 0.1558837890625,
"learning_rate": 1.8056604906573418e-06,
"loss": 0.0062,
"reward": 2.0384344711899756,
"reward_std": 0.6620127268135547,
"rewards/accuracy_reward": 0.7035714626312256,
"rewards/cosine_scaled_reward": 0.483077246020548,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.8517857760190963,
"step": 230
},
{
"completion_length": 739.6268203735351,
"epoch": 0.5013333333333333,
"grad_norm": 1.5264660120010376,
"kl": 0.145806884765625,
"learning_rate": 1.7506681449278226e-06,
"loss": 0.0058,
"reward": 1.999456986784935,
"reward_std": 0.7032103724777699,
"rewards/accuracy_reward": 0.6785714574158191,
"rewards/cosine_scaled_reward": 0.45302835907787087,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.8678572103381157,
"step": 235
},
{
"completion_length": 725.905387878418,
"epoch": 0.512,
"grad_norm": 13.703657150268555,
"kl": 0.354132080078125,
"learning_rate": 1.6953268804334257e-06,
"loss": 0.0142,
"reward": 2.012031316757202,
"reward_std": 0.6349152896553278,
"rewards/accuracy_reward": 0.6660714553669095,
"rewards/cosine_scaled_reward": 0.46024551438167693,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.8857143551111222,
"step": 240
},
{
"completion_length": 711.9410980224609,
"epoch": 0.5226666666666666,
"grad_norm": 42.922752380371094,
"kl": 0.81356201171875,
"learning_rate": 1.6397137297211436e-06,
"loss": 0.0325,
"reward": 2.129089578986168,
"reward_std": 0.699107101932168,
"rewards/accuracy_reward": 0.7160714577883482,
"rewards/cosine_scaled_reward": 0.5064704709046055,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.9065476730465889,
"step": 245
},
{
"completion_length": 738.9821746826171,
"epoch": 0.5333333333333333,
"grad_norm": 212.6622314453125,
"kl": 1.157550048828125,
"learning_rate": 1.5839061037913395e-06,
"loss": 0.0463,
"reward": 2.1009622782468798,
"reward_std": 0.7158728931099176,
"rewards/accuracy_reward": 0.7000000283122063,
"rewards/cosine_scaled_reward": 0.5027479250915349,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.8982143506407738,
"step": 250
},
{
"completion_length": 760.2428916931152,
"epoch": 0.544,
"grad_norm": 10.118670463562012,
"kl": 0.637158203125,
"learning_rate": 1.527981684345115e-06,
"loss": 0.0255,
"reward": 1.9621681660413741,
"reward_std": 0.67494813259691,
"rewards/accuracy_reward": 0.639285740070045,
"rewards/cosine_scaled_reward": 0.4276442806003615,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.8952381491661072,
"step": 255
},
{
"completion_length": 754.6803894042969,
"epoch": 0.5546666666666666,
"grad_norm": 7.878048419952393,
"kl": 0.972845458984375,
"learning_rate": 1.4720183156548855e-06,
"loss": 0.0389,
"reward": 1.9780788227915764,
"reward_std": 0.6262619759887457,
"rewards/accuracy_reward": 0.6339285982772708,
"rewards/cosine_scaled_reward": 0.4304597085807472,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.9136905416846275,
"step": 260
},
{
"completion_length": 751.5857498168946,
"epoch": 0.5653333333333334,
"grad_norm": 12.42583179473877,
"kl": 3.09744873046875,
"learning_rate": 1.4160938962086612e-06,
"loss": 0.1241,
"reward": 2.0433208346366882,
"reward_std": 0.661328698694706,
"rewards/accuracy_reward": 0.676785740442574,
"rewards/cosine_scaled_reward": 0.44689220561413096,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.9196429163217544,
"step": 265
},
{
"completion_length": 729.028604888916,
"epoch": 0.576,
"grad_norm": 7.453009605407715,
"kl": 2.2955322265625,
"learning_rate": 1.3602862702788567e-06,
"loss": 0.0917,
"reward": 2.094664843380451,
"reward_std": 0.6356621380895376,
"rewards/accuracy_reward": 0.7000000346451998,
"rewards/cosine_scaled_reward": 0.46371242445893585,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.9309524431824684,
"step": 270
},
{
"completion_length": 730.825032043457,
"epoch": 0.5866666666666667,
"grad_norm": 7.0367817878723145,
"kl": 0.6509521484375,
"learning_rate": 1.3046731195665748e-06,
"loss": 0.0261,
"reward": 2.083331751823425,
"reward_std": 0.6676435235887765,
"rewards/accuracy_reward": 0.6821428818628192,
"rewards/cosine_scaled_reward": 0.45714118536561726,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.944047674536705,
"step": 275
},
{
"completion_length": 742.180387878418,
"epoch": 0.5973333333333334,
"grad_norm": 1.3236949443817139,
"kl": 4.09298095703125,
"learning_rate": 1.2493318550721775e-06,
"loss": 0.1637,
"reward": 2.075996032357216,
"reward_std": 0.6379393456503749,
"rewards/accuracy_reward": 0.6857143174856901,
"rewards/cosine_scaled_reward": 0.4563530746847391,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.9339286297559738,
"step": 280
},
{
"completion_length": 708.1018157958985,
"epoch": 0.608,
"grad_norm": 5.264936447143555,
"kl": 0.21192626953125,
"learning_rate": 1.1943395093426585e-06,
"loss": 0.0085,
"reward": 2.1390477627515794,
"reward_std": 0.600306774303317,
"rewards/accuracy_reward": 0.7196428820490837,
"rewards/cosine_scaled_reward": 0.49619057439267633,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.9232143506407737,
"step": 285
},
{
"completion_length": 715.4125289916992,
"epoch": 0.6186666666666667,
"grad_norm": 2.6887574195861816,
"kl": 2.8669677734375,
"learning_rate": 1.1397726292458115e-06,
"loss": 0.1151,
"reward": 2.1179503470659258,
"reward_std": 0.5490788316354156,
"rewards/accuracy_reward": 0.7053571708500386,
"rewards/cosine_scaled_reward": 0.4905693273060024,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.9220238789916039,
"step": 290
},
{
"completion_length": 742.6803916931152,
"epoch": 0.6293333333333333,
"grad_norm": 6.9418721199035645,
"kl": 0.39151611328125,
"learning_rate": 1.085707169420437e-06,
"loss": 0.0157,
"reward": 1.8962592497467994,
"reward_std": 0.6060247957706452,
"rewards/accuracy_reward": 0.5964285938069225,
"rewards/cosine_scaled_reward": 0.3754258565604687,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.924404813349247,
"step": 295
},
{
"completion_length": 716.3464584350586,
"epoch": 0.64,
"grad_norm": 4.2906060218811035,
"kl": 0.57667236328125,
"learning_rate": 1.0322183865509054e-06,
"loss": 0.0231,
"reward": 2.1815308302640917,
"reward_std": 0.6235232371836901,
"rewards/accuracy_reward": 0.7428571732714773,
"rewards/cosine_scaled_reward": 0.5255783690838143,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.913095298409462,
"step": 300
},
{
"epoch": 0.64,
"eval_completion_length": 728.9849459716797,
"eval_kl": 22.31169453125,
"eval_loss": 0.8926114439964294,
"eval_reward": 1.9843467233777046,
"eval_reward_std": 0.6538388645738363,
"eval_rewards/accuracy_reward": 0.6382285982251167,
"eval_rewards/cosine_scaled_reward": 0.41530855364510644,
"eval_rewards/format_reward": 0.0,
"eval_rewards/reasoning_steps_reward": 0.9308095807313919,
"eval_runtime": 32207.7986,
"eval_samples_per_second": 0.155,
"eval_steps_per_second": 0.011,
"step": 300
},
{
"completion_length": 723.2625328063965,
"epoch": 0.6506666666666666,
"grad_norm": 79.97950744628906,
"kl": 487.1179443359375,
"learning_rate": 9.793807346132464e-07,
"loss": 19.4474,
"reward": 2.162437987327576,
"reward_std": 0.6324797321110964,
"rewards/accuracy_reward": 0.7267857410013676,
"rewards/cosine_scaled_reward": 0.5112474345514784,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.9244048178195954,
"step": 305
},
{
"completion_length": 739.6375335693359,
"epoch": 0.6613333333333333,
"grad_norm": 9.395992279052734,
"kl": 0.60579833984375,
"learning_rate": 9.272677612385667e-07,
"loss": 0.0242,
"reward": 2.004467612504959,
"reward_std": 0.6282935816794634,
"rewards/accuracy_reward": 0.6607143184170127,
"rewards/cosine_scaled_reward": 0.42589613443706187,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.9178571999073029,
"step": 310
},
{
"completion_length": 735.6286071777344,
"epoch": 0.672,
"grad_norm": 12.830111503601074,
"kl": 0.9565673828125,
"learning_rate": 8.759520053380591e-07,
"loss": 0.0383,
"reward": 1.9197196617722512,
"reward_std": 0.6299623921513557,
"rewards/accuracy_reward": 0.6035714576020836,
"rewards/cosine_scaled_reward": 0.39055290608666837,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.9255953043699264,
"step": 315
},
{
"completion_length": 718.0571731567383,
"epoch": 0.6826666666666666,
"grad_norm": 176.6972198486328,
"kl": 1.54287109375,
"learning_rate": 8.255048961321088e-07,
"loss": 0.0618,
"reward": 2.1281729131937026,
"reward_std": 0.6808584026992321,
"rewards/accuracy_reward": 0.714285746216774,
"rewards/cosine_scaled_reward": 0.4888871216215193,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.9250000536441803,
"step": 320
},
{
"completion_length": 721.4732475280762,
"epoch": 0.6933333333333334,
"grad_norm": 6.025720119476318,
"kl": 0.98104248046875,
"learning_rate": 7.759966537240373e-07,
"loss": 0.0392,
"reward": 2.054315000772476,
"reward_std": 0.6834255807101727,
"rewards/accuracy_reward": 0.6714285992085933,
"rewards/cosine_scaled_reward": 0.45312447142787277,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.9297619640827179,
"step": 325
},
{
"completion_length": 729.3982498168946,
"epoch": 0.704,
"grad_norm": 6.682721138000488,
"kl": 2.40982666015625,
"learning_rate": 7.274961913568773e-07,
"loss": 0.0964,
"reward": 2.0376005843281746,
"reward_std": 0.7055317234247923,
"rewards/accuracy_reward": 0.6660714562982321,
"rewards/cosine_scaled_reward": 0.4655766852200031,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.9059524461627007,
"step": 330
},
{
"completion_length": 737.005387878418,
"epoch": 0.7146666666666667,
"grad_norm": 21.818754196166992,
"kl": 0.653094482421875,
"learning_rate": 6.800710194892484e-07,
"loss": 0.0261,
"reward": 2.056803268194199,
"reward_std": 0.7108213260769844,
"rewards/accuracy_reward": 0.6660714574158192,
"rewards/cosine_scaled_reward": 0.45680318772792816,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.9339286327362061,
"step": 335
},
{
"completion_length": 729.6393203735352,
"epoch": 0.7253333333333334,
"grad_norm": 4.025352954864502,
"kl": 0.63848876953125,
"learning_rate": 6.33787151823836e-07,
"loss": 0.0256,
"reward": 1.9720933943986894,
"reward_std": 0.6898978160694241,
"rewards/accuracy_reward": 0.6250000264495611,
"rewards/cosine_scaled_reward": 0.42685523356776683,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.9202381581068039,
"step": 340
},
{
"completion_length": 699.1571701049804,
"epoch": 0.736,
"grad_norm": 5.142830848693848,
"kl": 0.65721435546875,
"learning_rate": 5.887090134192947e-07,
"loss": 0.0263,
"reward": 2.100009024143219,
"reward_std": 0.6496724892407656,
"rewards/accuracy_reward": 0.6910714615136385,
"rewards/cosine_scaled_reward": 0.4851280112750828,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.9238095805048943,
"step": 345
},
{
"completion_length": 723.3910995483399,
"epoch": 0.7466666666666667,
"grad_norm": 4.602946758270264,
"kl": 0.394140625,
"learning_rate": 5.448993510134669e-07,
"loss": 0.0158,
"reward": 2.0926264360547067,
"reward_std": 0.6916316740214825,
"rewards/accuracy_reward": 0.6857143180444837,
"rewards/cosine_scaled_reward": 0.4831025514518842,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.9238095790147781,
"step": 350
},
{
"completion_length": 722.5375305175781,
"epoch": 0.7573333333333333,
"grad_norm": 6.0756731033325195,
"kl": 1.08592529296875,
"learning_rate": 5.024191456827498e-07,
"loss": 0.0435,
"reward": 2.0994770556688307,
"reward_std": 0.666194306127727,
"rewards/accuracy_reward": 0.6982143167406321,
"rewards/cosine_scaled_reward": 0.4917388891801238,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.9095238700509072,
"step": 355
},
{
"completion_length": 713.5250350952149,
"epoch": 0.768,
"grad_norm": 7.16264533996582,
"kl": 26.27894287109375,
"learning_rate": 4.6132752795918667e-07,
"loss": 1.0497,
"reward": 2.055359125137329,
"reward_std": 0.7066416556015611,
"rewards/accuracy_reward": 0.6678571753203869,
"rewards/cosine_scaled_reward": 0.4732161985710263,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.9142857760190963,
"step": 360
},
{
"completion_length": 751.5964584350586,
"epoch": 0.7786666666666666,
"grad_norm": 3.023808002471924,
"kl": 1.154327392578125,
"learning_rate": 4.2168169552342905e-07,
"loss": 0.0462,
"reward": 1.9766315311193465,
"reward_std": 0.7433438140898943,
"rewards/accuracy_reward": 0.6339286021888256,
"rewards/cosine_scaled_reward": 0.42544099894585086,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.9172619596123696,
"step": 365
},
{
"completion_length": 704.278596496582,
"epoch": 0.7893333333333333,
"grad_norm": 1.0741926431655884,
"kl": 0.53934326171875,
"learning_rate": 3.8353683358814046e-07,
"loss": 0.0216,
"reward": 2.0491741001605988,
"reward_std": 0.587555892020464,
"rewards/accuracy_reward": 0.6678571693599225,
"rewards/cosine_scaled_reward": 0.46226926781237127,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.9190476790070534,
"step": 370
},
{
"completion_length": 738.875033569336,
"epoch": 0.8,
"grad_norm": 41.52888870239258,
"kl": 0.6643310546875,
"learning_rate": 3.469460380826697e-07,
"loss": 0.0265,
"reward": 2.0449665546417237,
"reward_std": 0.6989724855870009,
"rewards/accuracy_reward": 0.6625000312924385,
"rewards/cosine_scaled_reward": 0.4574664521496743,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.9250000640749931,
"step": 375
},
{
"completion_length": 724.0678855895997,
"epoch": 0.8106666666666666,
"grad_norm": 4.322193145751953,
"kl": 0.7086669921875,
"learning_rate": 3.119602417459075e-07,
"loss": 0.0284,
"reward": 2.055614770948887,
"reward_std": 0.6039443843066692,
"rewards/accuracy_reward": 0.667857171408832,
"rewards/cosine_scaled_reward": 0.46275755076203495,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.9250000655651093,
"step": 380
},
{
"completion_length": 739.5125350952148,
"epoch": 0.8213333333333334,
"grad_norm": 4.056361198425293,
"kl": 0.7447509765625,
"learning_rate": 2.786281432302071e-07,
"loss": 0.0298,
"reward": 2.0523035705089567,
"reward_std": 0.6267267379909753,
"rewards/accuracy_reward": 0.6750000279396773,
"rewards/cosine_scaled_reward": 0.4463511134439614,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.9309524476528168,
"step": 385
},
{
"completion_length": 722.548243713379,
"epoch": 0.832,
"grad_norm": 1.378568410873413,
"kl": 0.501007080078125,
"learning_rate": 2.46996139315057e-07,
"loss": 0.02,
"reward": 2.0793206453323365,
"reward_std": 0.6533296214416623,
"rewards/accuracy_reward": 0.6875000290572644,
"rewards/cosine_scaled_reward": 0.4781300783797633,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.9136905401945115,
"step": 390
},
{
"completion_length": 732.4714630126953,
"epoch": 0.8426666666666667,
"grad_norm": 2.4824626445770264,
"kl": 0.71015625,
"learning_rate": 2.1710826032485286e-07,
"loss": 0.0284,
"reward": 2.1464335188269614,
"reward_std": 0.6267410140484572,
"rewards/accuracy_reward": 0.7071428874507546,
"rewards/cosine_scaled_reward": 0.5136953465640545,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.9255953013896943,
"step": 395
},
{
"completion_length": 769.8607528686523,
"epoch": 0.8533333333333334,
"grad_norm": 5.1279401779174805,
"kl": 0.787158203125,
"learning_rate": 1.8900610884066817e-07,
"loss": 0.0315,
"reward": 1.9811220198869706,
"reward_std": 0.6900037627667188,
"rewards/accuracy_reward": 0.6357143126428128,
"rewards/cosine_scaled_reward": 0.4329076783033088,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.912500062584877,
"step": 400
},
{
"epoch": 0.8533333333333334,
"eval_completion_length": 738.6478045166016,
"eval_kl": 0.67065634765625,
"eval_loss": 0.026821324601769447,
"eval_reward": 1.9358687758922577,
"eval_reward_std": 0.681571420711279,
"eval_rewards/accuracy_reward": 0.6160857413113118,
"eval_rewards/cosine_scaled_reward": 0.4032782297934056,
"eval_rewards/format_reward": 0.0,
"eval_rewards/reasoning_steps_reward": 0.9165048221349716,
"eval_runtime": 32285.4404,
"eval_samples_per_second": 0.155,
"eval_steps_per_second": 0.011,
"step": 400
},
{
"completion_length": 763.4339599609375,
"epoch": 0.864,
"grad_norm": 4.143102169036865,
"kl": 0.609136962890625,
"learning_rate": 1.627288017913383e-07,
"loss": 0.0244,
"reward": 1.9788720414042473,
"reward_std": 0.6925495602190495,
"rewards/accuracy_reward": 0.6375000275671482,
"rewards/cosine_scaled_reward": 0.42411007191985844,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.9172619670629502,
"step": 405
},
{
"completion_length": 754.2500312805175,
"epoch": 0.8746666666666667,
"grad_norm": 4.33268928527832,
"kl": 0.9586181640625,
"learning_rate": 1.3831291600445573e-07,
"loss": 0.0383,
"reward": 1.9650759071111679,
"reward_std": 0.6423604141920805,
"rewards/accuracy_reward": 0.6303571704775095,
"rewards/cosine_scaled_reward": 0.4222186904400587,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.912500062584877,
"step": 410
},
{
"completion_length": 751.8071762084961,
"epoch": 0.8853333333333333,
"grad_norm": 7.097233295440674,
"kl": 0.8556884765625,
"learning_rate": 1.1579243729307487e-07,
"loss": 0.0342,
"reward": 1.9338065341114998,
"reward_std": 0.7414230849593878,
"rewards/accuracy_reward": 0.6321428898721934,
"rewards/cosine_scaled_reward": 0.41178265907801687,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.8898810192942619,
"step": 415
},
{
"completion_length": 752.8571723937988,
"epoch": 0.896,
"grad_norm": 3.0274124145507812,
"kl": 0.67294921875,
"learning_rate": 9.519871314899092e-08,
"loss": 0.0269,
"reward": 1.9913182631134987,
"reward_std": 0.7086525153368711,
"rewards/accuracy_reward": 0.6571428876370191,
"rewards/cosine_scaled_reward": 0.4359610580140725,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.8982143491506577,
"step": 420
},
{
"completion_length": 751.7411056518555,
"epoch": 0.9066666666666666,
"grad_norm": 1.3194289207458496,
"kl": 0.722802734375,
"learning_rate": 7.656040910844358e-08,
"loss": 0.0289,
"reward": 2.0188252568244933,
"reward_std": 0.7707155652344226,
"rewards/accuracy_reward": 0.644642885029316,
"rewards/cosine_scaled_reward": 0.44144419142976404,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.9327381521463394,
"step": 425
},
{
"completion_length": 755.0464630126953,
"epoch": 0.9173333333333333,
"grad_norm": 4.276956081390381,
"kl": 0.9569580078125,
"learning_rate": 5.990346885098235e-08,
"loss": 0.0383,
"reward": 2.000167742371559,
"reward_std": 0.7376608021557332,
"rewards/accuracy_reward": 0.6589285988360644,
"rewards/cosine_scaled_reward": 0.45314384531229734,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.8880952954292297,
"step": 430
},
{
"completion_length": 727.2536087036133,
"epoch": 0.928,
"grad_norm": 19.139204025268555,
"kl": 1.32947998046875,
"learning_rate": 4.5251078087033493e-08,
"loss": 0.0532,
"reward": 2.039694218337536,
"reward_std": 0.6533694989979267,
"rewards/accuracy_reward": 0.6732143165543676,
"rewards/cosine_scaled_reward": 0.4462417368311435,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.9202381521463394,
"step": 435
},
{
"completion_length": 734.5536064147949,
"epoch": 0.9386666666666666,
"grad_norm": 9.922527313232422,
"kl": 1.4177001953125,
"learning_rate": 3.262363228443427e-08,
"loss": 0.0567,
"reward": 1.9774114236235618,
"reward_std": 0.7198221303522587,
"rewards/accuracy_reward": 0.6571428865194321,
"rewards/cosine_scaled_reward": 0.4309827778954059,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.8892857789993286,
"step": 440
},
{
"completion_length": 755.5053962707519,
"epoch": 0.9493333333333334,
"grad_norm": 3.058717727661133,
"kl": 1.02747802734375,
"learning_rate": 2.2038708278862952e-08,
"loss": 0.0411,
"reward": 1.9413904681801797,
"reward_std": 0.6192027345299721,
"rewards/accuracy_reward": 0.6214285951107741,
"rewards/cosine_scaled_reward": 0.41579519272781906,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.9041667267680168,
"step": 445
},
{
"completion_length": 723.6143173217773,
"epoch": 0.96,
"grad_norm": 2.64345383644104,
"kl": 0.74544677734375,
"learning_rate": 1.3511039807673209e-08,
"loss": 0.0298,
"reward": 2.1570381984114646,
"reward_std": 0.6153812855482101,
"rewards/accuracy_reward": 0.7089286003261804,
"rewards/cosine_scaled_reward": 0.5165619559586048,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.9315476790070534,
"step": 450
},
{
"completion_length": 728.894679260254,
"epoch": 0.9706666666666667,
"grad_norm": 2.217505693435669,
"kl": 0.678607177734375,
"learning_rate": 7.0524970011963675e-09,
"loss": 0.0272,
"reward": 2.2157696574926375,
"reward_std": 0.6317826233804226,
"rewards/accuracy_reward": 0.7500000305473804,
"rewards/cosine_scaled_reward": 0.5425553207285703,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.9232143491506577,
"step": 455
},
{
"completion_length": 722.3839637756348,
"epoch": 0.9813333333333333,
"grad_norm": 3.196773052215576,
"kl": 0.709228515625,
"learning_rate": 2.6720698600553595e-09,
"loss": 0.0284,
"reward": 2.122882993519306,
"reward_std": 0.599827627837658,
"rewards/accuracy_reward": 0.7017857432365417,
"rewards/cosine_scaled_reward": 0.5175257750786841,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.9035714983940124,
"step": 460
},
{
"completion_length": 754.775032043457,
"epoch": 0.992,
"grad_norm": 8.455827713012695,
"kl": 0.835205078125,
"learning_rate": 3.7585574148779613e-10,
"loss": 0.0334,
"reward": 1.9985675051808358,
"reward_std": 0.7642196819186211,
"rewards/accuracy_reward": 0.6500000316649676,
"rewards/cosine_scaled_reward": 0.4402340850589098,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.9083333894610405,
"step": 465
},
{
"completion_length": 746.1607462565104,
"epoch": 0.9984,
"kl": 0.8069661458333334,
"reward": 2.0161508160332837,
"reward_std": 0.7148686709503332,
"rewards/accuracy_reward": 0.6636905111372471,
"rewards/cosine_scaled_reward": 0.4536507367156446,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.898809589445591,
"step": 468,
"total_flos": 0.0,
"train_loss": 4841.422249500714,
"train_runtime": 180396.3107,
"train_samples_per_second": 0.042,
"train_steps_per_second": 0.003
}
],
"logging_steps": 5,
"max_steps": 468,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}