|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.0, |
|
"eval_steps": 10, |
|
"global_step": 1125, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"completion_length": 440.27813415527345, |
|
"epoch": 0.02666666666666667, |
|
"grad_norm": 0.35731378197669983, |
|
"kl": 7.561445236206054e-05, |
|
"learning_rate": 2.654867256637168e-07, |
|
"loss": 0.0, |
|
"reward": 0.15875000283122062, |
|
"reward_std": 0.25816794726997616, |
|
"rewards/accuracy_reward": 0.12875000196509062, |
|
"rewards/format_reward": 0.030000000586733223, |
|
"step": 10 |
|
}, |
|
{ |
|
"completion_length": 467.41250762939455, |
|
"epoch": 0.05333333333333334, |
|
"grad_norm": 0.4308978319168091, |
|
"kl": 0.00021507740020751954, |
|
"learning_rate": 5.309734513274336e-07, |
|
"loss": 0.0, |
|
"reward": 0.15562500259839, |
|
"reward_std": 0.27249111477285626, |
|
"rewards/accuracy_reward": 0.13000000221654773, |
|
"rewards/format_reward": 0.025625000568106772, |
|
"step": 20 |
|
}, |
|
{ |
|
"completion_length": 396.5225093841553, |
|
"epoch": 0.08, |
|
"grad_norm": 52.731353759765625, |
|
"kl": 0.004652214050292969, |
|
"learning_rate": 7.964601769911505e-07, |
|
"loss": 0.0002, |
|
"reward": 0.31500000339001416, |
|
"reward_std": 0.3808905828744173, |
|
"rewards/accuracy_reward": 0.12812500270083546, |
|
"rewards/format_reward": 0.18687500241212546, |
|
"step": 30 |
|
}, |
|
{ |
|
"completion_length": 185.5206272125244, |
|
"epoch": 0.10666666666666667, |
|
"grad_norm": 0.5472067594528198, |
|
"kl": 0.0395751953125, |
|
"learning_rate": 1.0619469026548673e-06, |
|
"loss": 0.0016, |
|
"reward": 0.8362500123679638, |
|
"reward_std": 0.41473600510507824, |
|
"rewards/accuracy_reward": 0.10625000237487256, |
|
"rewards/format_reward": 0.7300000071525574, |
|
"step": 40 |
|
}, |
|
{ |
|
"completion_length": 92.74375104904175, |
|
"epoch": 0.13333333333333333, |
|
"grad_norm": 0.49235209822654724, |
|
"kl": 0.11463623046875, |
|
"learning_rate": 1.3274336283185841e-06, |
|
"loss": 0.0046, |
|
"reward": 1.1543750211596489, |
|
"reward_std": 0.2653807267546654, |
|
"rewards/accuracy_reward": 0.190625003259629, |
|
"rewards/format_reward": 0.9637500062584877, |
|
"step": 50 |
|
}, |
|
{ |
|
"completion_length": 182.87937908172609, |
|
"epoch": 0.16, |
|
"grad_norm": 0.7026166915893555, |
|
"kl": 0.10775146484375, |
|
"learning_rate": 1.592920353982301e-06, |
|
"loss": 0.0043, |
|
"reward": 1.330625008046627, |
|
"reward_std": 0.3806515397503972, |
|
"rewards/accuracy_reward": 0.4137500094715506, |
|
"rewards/format_reward": 0.9168750137090683, |
|
"step": 60 |
|
}, |
|
{ |
|
"completion_length": 286.4493797302246, |
|
"epoch": 0.18666666666666668, |
|
"grad_norm": 0.19512760639190674, |
|
"kl": 0.071099853515625, |
|
"learning_rate": 1.8584070796460177e-06, |
|
"loss": 0.0028, |
|
"reward": 1.4037500321865082, |
|
"reward_std": 0.4500323969870806, |
|
"rewards/accuracy_reward": 0.5518750101327896, |
|
"rewards/format_reward": 0.8518750131130218, |
|
"step": 70 |
|
}, |
|
{ |
|
"completion_length": 300.55125617980957, |
|
"epoch": 0.21333333333333335, |
|
"grad_norm": 0.19568626582622528, |
|
"kl": 0.066259765625, |
|
"learning_rate": 2.1238938053097345e-06, |
|
"loss": 0.0027, |
|
"reward": 1.5556250244379044, |
|
"reward_std": 0.3508193654939532, |
|
"rewards/accuracy_reward": 0.6306250087916851, |
|
"rewards/format_reward": 0.9250000104308128, |
|
"step": 80 |
|
}, |
|
{ |
|
"completion_length": 323.54563064575194, |
|
"epoch": 0.24, |
|
"grad_norm": 0.210642471909523, |
|
"kl": 0.063372802734375, |
|
"learning_rate": 2.3893805309734516e-06, |
|
"loss": 0.0025, |
|
"reward": 1.5068750262260437, |
|
"reward_std": 0.37218285240232946, |
|
"rewards/accuracy_reward": 0.5925000049173832, |
|
"rewards/format_reward": 0.9143750101327897, |
|
"step": 90 |
|
}, |
|
{ |
|
"completion_length": 335.67500762939454, |
|
"epoch": 0.26666666666666666, |
|
"grad_norm": 0.17836834490299225, |
|
"kl": 0.062786865234375, |
|
"learning_rate": 2.6548672566371683e-06, |
|
"loss": 0.0025, |
|
"reward": 1.5043750196695327, |
|
"reward_std": 0.38465452194213867, |
|
"rewards/accuracy_reward": 0.6131250146776438, |
|
"rewards/format_reward": 0.8912500098347664, |
|
"step": 100 |
|
}, |
|
{ |
|
"completion_length": 302.84687995910645, |
|
"epoch": 0.29333333333333333, |
|
"grad_norm": 0.20703768730163574, |
|
"kl": 0.06512451171875, |
|
"learning_rate": 2.920353982300885e-06, |
|
"loss": 0.0026, |
|
"reward": 1.4862500309944153, |
|
"reward_std": 0.3997718315571547, |
|
"rewards/accuracy_reward": 0.6006250124424696, |
|
"rewards/format_reward": 0.8856250092387199, |
|
"step": 110 |
|
}, |
|
{ |
|
"completion_length": 247.6512535095215, |
|
"epoch": 0.32, |
|
"grad_norm": 0.22252444922924042, |
|
"kl": 0.071343994140625, |
|
"learning_rate": 2.9996458567456176e-06, |
|
"loss": 0.0029, |
|
"reward": 1.4106250196695327, |
|
"reward_std": 0.3797724399715662, |
|
"rewards/accuracy_reward": 0.49812501221895217, |
|
"rewards/format_reward": 0.912500011920929, |
|
"step": 120 |
|
}, |
|
{ |
|
"completion_length": 275.83062744140625, |
|
"epoch": 0.3466666666666667, |
|
"grad_norm": 704.6102905273438, |
|
"kl": 1.121209716796875, |
|
"learning_rate": 2.997911680090067e-06, |
|
"loss": 0.0449, |
|
"reward": 1.491250029206276, |
|
"reward_std": 0.33902281522750854, |
|
"rewards/accuracy_reward": 0.5725000131875276, |
|
"rewards/format_reward": 0.918750011920929, |
|
"step": 130 |
|
}, |
|
{ |
|
"completion_length": 762.2556308746338, |
|
"epoch": 0.37333333333333335, |
|
"grad_norm": 6.110317230224609, |
|
"kl": 169.40343017578124, |
|
"learning_rate": 2.9947340923033686e-06, |
|
"loss": 6.7759, |
|
"reward": 0.2593750054948032, |
|
"reward_std": 0.14749210346490144, |
|
"rewards/accuracy_reward": 0.13187500317580997, |
|
"rewards/format_reward": 0.12750000222586094, |
|
"step": 140 |
|
}, |
|
{ |
|
"completion_length": 577.000634765625, |
|
"epoch": 0.4, |
|
"grad_norm": 1.6375890970230103, |
|
"kl": 50.33268432617187, |
|
"learning_rate": 2.99011615535883e-06, |
|
"loss": 2.0188, |
|
"reward": 0.5993750111199916, |
|
"reward_std": 0.27639281619340184, |
|
"rewards/accuracy_reward": 0.21562500661239029, |
|
"rewards/format_reward": 0.3837500066496432, |
|
"step": 150 |
|
}, |
|
{ |
|
"completion_length": 278.7000045776367, |
|
"epoch": 0.4266666666666667, |
|
"grad_norm": 0.1893930733203888, |
|
"kl": 0.142626953125, |
|
"learning_rate": 2.984062319172742e-06, |
|
"loss": 0.0057, |
|
"reward": 1.5175000250339508, |
|
"reward_std": 0.3350706363096833, |
|
"rewards/accuracy_reward": 0.5768750105053186, |
|
"rewards/format_reward": 0.9406250104308128, |
|
"step": 160 |
|
}, |
|
{ |
|
"completion_length": 283.7487537384033, |
|
"epoch": 0.4533333333333333, |
|
"grad_norm": 0.3801301419734955, |
|
"kl": 0.07415771484375, |
|
"learning_rate": 2.9765784173163723e-06, |
|
"loss": 0.003, |
|
"reward": 1.428125023841858, |
|
"reward_std": 0.4501983530819416, |
|
"rewards/accuracy_reward": 0.5725000098347663, |
|
"rewards/format_reward": 0.8556250154972076, |
|
"step": 170 |
|
}, |
|
{ |
|
"completion_length": 317.7350051879883, |
|
"epoch": 0.48, |
|
"grad_norm": 0.17540688812732697, |
|
"kl": 0.07098388671875, |
|
"learning_rate": 2.967671661394643e-06, |
|
"loss": 0.0028, |
|
"reward": 1.4887500375509262, |
|
"reward_std": 0.346977224946022, |
|
"rewards/accuracy_reward": 0.5512500097975135, |
|
"rewards/format_reward": 0.9375000059604645, |
|
"step": 180 |
|
}, |
|
{ |
|
"completion_length": 362.6912559509277, |
|
"epoch": 0.5066666666666667, |
|
"grad_norm": 0.17630642652511597, |
|
"kl": 0.062762451171875, |
|
"learning_rate": 2.957350634096912e-06, |
|
"loss": 0.0025, |
|
"reward": 1.4950000196695328, |
|
"reward_std": 0.3771749962121248, |
|
"rewards/accuracy_reward": 0.5831250146031379, |
|
"rewards/format_reward": 0.9118750140070915, |
|
"step": 190 |
|
}, |
|
{ |
|
"completion_length": 388.05750503540037, |
|
"epoch": 0.5333333333333333, |
|
"grad_norm": 0.14493422210216522, |
|
"kl": 0.07095947265625, |
|
"learning_rate": 2.945625280926568e-06, |
|
"loss": 0.0028, |
|
"reward": 1.4818750262260436, |
|
"reward_std": 0.43593788109719755, |
|
"rewards/accuracy_reward": 0.6106250114738941, |
|
"rewards/format_reward": 0.8712500110268593, |
|
"step": 200 |
|
}, |
|
{ |
|
"completion_length": 346.9868797302246, |
|
"epoch": 0.56, |
|
"grad_norm": 0.24546337127685547, |
|
"kl": 0.07967529296875, |
|
"learning_rate": 2.932506900617379e-06, |
|
"loss": 0.0032, |
|
"reward": 1.4625000298023223, |
|
"reward_std": 0.43860488161444666, |
|
"rewards/accuracy_reward": 0.5737500078976154, |
|
"rewards/format_reward": 0.8887500122189522, |
|
"step": 210 |
|
}, |
|
{ |
|
"completion_length": 309.0206298828125, |
|
"epoch": 0.5866666666666667, |
|
"grad_norm": 0.2144029587507248, |
|
"kl": 0.11422119140625, |
|
"learning_rate": 2.91800813424586e-06, |
|
"loss": 0.0046, |
|
"reward": 1.3962500274181366, |
|
"reward_std": 0.46159769259393213, |
|
"rewards/accuracy_reward": 0.5225000099278987, |
|
"rewards/format_reward": 0.8737500131130218, |
|
"step": 220 |
|
}, |
|
{ |
|
"completion_length": 313.8818809509277, |
|
"epoch": 0.6133333333333333, |
|
"grad_norm": 0.16003139317035675, |
|
"kl": 0.17523193359375, |
|
"learning_rate": 2.9021429530501337e-06, |
|
"loss": 0.007, |
|
"reward": 1.4012500315904617, |
|
"reward_std": 0.4679255347698927, |
|
"rewards/accuracy_reward": 0.5293750107288361, |
|
"rewards/format_reward": 0.8718750134110451, |
|
"step": 230 |
|
}, |
|
{ |
|
"completion_length": 302.87313079833984, |
|
"epoch": 0.64, |
|
"grad_norm": 0.17115922272205353, |
|
"kl": 0.216015625, |
|
"learning_rate": 2.8849266449670255e-06, |
|
"loss": 0.0086, |
|
"reward": 1.4156250298023223, |
|
"reward_std": 0.5012115199118853, |
|
"rewards/accuracy_reward": 0.5593750070780515, |
|
"rewards/format_reward": 0.856250011920929, |
|
"step": 240 |
|
}, |
|
{ |
|
"completion_length": 267.08875503540037, |
|
"epoch": 0.6666666666666666, |
|
"grad_norm": 0.237528994679451, |
|
"kl": 0.2441162109375, |
|
"learning_rate": 2.866375799900369e-06, |
|
"loss": 0.0098, |
|
"reward": 1.367500016093254, |
|
"reward_std": 0.48268986456096175, |
|
"rewards/accuracy_reward": 0.507500009611249, |
|
"rewards/format_reward": 0.8600000113248825, |
|
"step": 250 |
|
}, |
|
{ |
|
"completion_length": 271.5650035858154, |
|
"epoch": 0.6933333333333334, |
|
"grad_norm": 0.13727092742919922, |
|
"kl": 0.19796142578125, |
|
"learning_rate": 2.8465082937347156e-06, |
|
"loss": 0.0079, |
|
"reward": 1.4412500113248825, |
|
"reward_std": 0.3789402700960636, |
|
"rewards/accuracy_reward": 0.5393750105053187, |
|
"rewards/format_reward": 0.9018750116229057, |
|
"step": 260 |
|
}, |
|
{ |
|
"completion_length": 331.74375648498534, |
|
"epoch": 0.72, |
|
"grad_norm": 0.14567354321479797, |
|
"kl": 0.2938720703125, |
|
"learning_rate": 2.8253432711098524e-06, |
|
"loss": 0.0118, |
|
"reward": 1.3037500187754631, |
|
"reward_std": 0.5119317132979632, |
|
"rewards/accuracy_reward": 0.47687500678002837, |
|
"rewards/format_reward": 0.8268750131130218, |
|
"step": 270 |
|
}, |
|
{ |
|
"completion_length": 270.1787559509277, |
|
"epoch": 0.7466666666666667, |
|
"grad_norm": 0.16504357755184174, |
|
"kl": 0.2083251953125, |
|
"learning_rate": 2.802901126972727e-06, |
|
"loss": 0.0083, |
|
"reward": 1.407500022649765, |
|
"reward_std": 0.4102827299386263, |
|
"rewards/accuracy_reward": 0.5068750072270631, |
|
"rewards/format_reward": 0.9006250113248825, |
|
"step": 280 |
|
}, |
|
{ |
|
"completion_length": 394.0706314086914, |
|
"epoch": 0.7733333333333333, |
|
"grad_norm": 0.13634301722049713, |
|
"kl": 0.244677734375, |
|
"learning_rate": 2.7792034869245574e-06, |
|
"loss": 0.0098, |
|
"reward": 1.365000019967556, |
|
"reward_std": 0.552520602196455, |
|
"rewards/accuracy_reward": 0.5725000109523535, |
|
"rewards/format_reward": 0.7925000131130219, |
|
"step": 290 |
|
}, |
|
{ |
|
"completion_length": 359.3012565612793, |
|
"epoch": 0.8, |
|
"grad_norm": 0.12871386110782623, |
|
"kl": 0.17838134765625, |
|
"learning_rate": 2.7542731863820665e-06, |
|
"loss": 0.0071, |
|
"reward": 1.4187500134110451, |
|
"reward_std": 0.45300735253840685, |
|
"rewards/accuracy_reward": 0.5175000108778477, |
|
"rewards/format_reward": 0.9012500166893005, |
|
"step": 300 |
|
}, |
|
{ |
|
"completion_length": 319.09125633239745, |
|
"epoch": 0.8266666666666667, |
|
"grad_norm": 0.15812256932258606, |
|
"kl": 0.2130859375, |
|
"learning_rate": 2.7281342485729135e-06, |
|
"loss": 0.0085, |
|
"reward": 1.4262500196695327, |
|
"reward_std": 0.47630340307950975, |
|
"rewards/accuracy_reward": 0.5343750081956387, |
|
"rewards/format_reward": 0.891875010728836, |
|
"step": 310 |
|
}, |
|
{ |
|
"completion_length": 236.86187667846679, |
|
"epoch": 0.8533333333333334, |
|
"grad_norm": 0.14847490191459656, |
|
"kl": 0.1891845703125, |
|
"learning_rate": 2.7008118613865407e-06, |
|
"loss": 0.0076, |
|
"reward": 1.5068750202655792, |
|
"reward_std": 0.35237235836684705, |
|
"rewards/accuracy_reward": 0.5625000081956386, |
|
"rewards/format_reward": 0.9443750113248826, |
|
"step": 320 |
|
}, |
|
{ |
|
"completion_length": 280.200630569458, |
|
"epoch": 0.88, |
|
"grad_norm": 0.1344127207994461, |
|
"kl": 0.1708984375, |
|
"learning_rate": 2.6723323531027237e-06, |
|
"loss": 0.0068, |
|
"reward": 1.5125000149011611, |
|
"reward_std": 0.392687563598156, |
|
"rewards/accuracy_reward": 0.585625009983778, |
|
"rewards/format_reward": 0.9268750116229058, |
|
"step": 330 |
|
}, |
|
{ |
|
"completion_length": 340.8256294250488, |
|
"epoch": 0.9066666666666666, |
|
"grad_norm": 0.12343962490558624, |
|
"kl": 0.20294189453125, |
|
"learning_rate": 2.642723167021233e-06, |
|
"loss": 0.0081, |
|
"reward": 1.4475000262260438, |
|
"reward_std": 0.42229729425162077, |
|
"rewards/accuracy_reward": 0.554375009611249, |
|
"rewards/format_reward": 0.8931250154972077, |
|
"step": 340 |
|
}, |
|
{ |
|
"completion_length": 321.01875457763674, |
|
"epoch": 0.9333333333333333, |
|
"grad_norm": 0.1363190859556198, |
|
"kl": 0.17435302734375, |
|
"learning_rate": 2.612012835017041e-06, |
|
"loss": 0.007, |
|
"reward": 1.5350000232458114, |
|
"reward_std": 0.38834156226366756, |
|
"rewards/accuracy_reward": 0.6112500090152025, |
|
"rewards/format_reward": 0.923750014603138, |
|
"step": 350 |
|
}, |
|
{ |
|
"completion_length": 356.4068801879883, |
|
"epoch": 0.96, |
|
"grad_norm": 0.1496347039937973, |
|
"kl": 0.245654296875, |
|
"learning_rate": 2.5802309500465564e-06, |
|
"loss": 0.0098, |
|
"reward": 1.4331250190734863, |
|
"reward_std": 0.5568725638091564, |
|
"rewards/accuracy_reward": 0.5818750094622374, |
|
"rewards/format_reward": 0.8512500137090683, |
|
"step": 360 |
|
}, |
|
{ |
|
"completion_length": 315.9900047302246, |
|
"epoch": 0.9866666666666667, |
|
"grad_norm": 0.15564337372779846, |
|
"kl": 0.2264892578125, |
|
"learning_rate": 2.547408137631396e-06, |
|
"loss": 0.0091, |
|
"reward": 1.4825000256299972, |
|
"reward_std": 0.47740888558328154, |
|
"rewards/accuracy_reward": 0.5875000122934579, |
|
"rewards/format_reward": 0.8950000122189522, |
|
"step": 370 |
|
}, |
|
{ |
|
"completion_length": 292.8037563323975, |
|
"epoch": 1.0133333333333334, |
|
"grad_norm": 0.11661098897457123, |
|
"kl": 0.18624267578125, |
|
"learning_rate": 2.5135760263471446e-06, |
|
"loss": 0.0075, |
|
"reward": 1.5643750160932541, |
|
"reward_std": 0.39525898918509483, |
|
"rewards/accuracy_reward": 0.6393750097602606, |
|
"rewards/format_reward": 0.9250000074505806, |
|
"step": 380 |
|
}, |
|
{ |
|
"completion_length": 345.10375556945803, |
|
"epoch": 1.04, |
|
"grad_norm": 0.10983074456453323, |
|
"kl": 0.2192138671875, |
|
"learning_rate": 2.478767217345571e-06, |
|
"loss": 0.0088, |
|
"reward": 1.474375021457672, |
|
"reward_std": 0.46512946095317603, |
|
"rewards/accuracy_reward": 0.5981250118464232, |
|
"rewards/format_reward": 0.8762500181794166, |
|
"step": 390 |
|
}, |
|
{ |
|
"completion_length": 323.054377746582, |
|
"epoch": 1.0666666666666667, |
|
"grad_norm": 0.13792556524276733, |
|
"kl": 0.1779541015625, |
|
"learning_rate": 2.443015252939646e-06, |
|
"loss": 0.0071, |
|
"reward": 1.4987500220537187, |
|
"reward_std": 0.3944621989503503, |
|
"rewards/accuracy_reward": 0.578125012293458, |
|
"rewards/format_reward": 0.9206250131130218, |
|
"step": 400 |
|
}, |
|
{ |
|
"completion_length": 337.12000579833983, |
|
"epoch": 1.0933333333333333, |
|
"grad_norm": 0.10129429399967194, |
|
"kl": 0.20455322265625, |
|
"learning_rate": 2.406354584281642e-06, |
|
"loss": 0.0082, |
|
"reward": 1.52812502682209, |
|
"reward_std": 0.4762104984372854, |
|
"rewards/accuracy_reward": 0.6387500144541264, |
|
"rewards/format_reward": 0.8893750131130218, |
|
"step": 410 |
|
}, |
|
{ |
|
"completion_length": 317.5525062561035, |
|
"epoch": 1.12, |
|
"grad_norm": 0.12564826011657715, |
|
"kl": 0.210009765625, |
|
"learning_rate": 2.3688205381654686e-06, |
|
"loss": 0.0084, |
|
"reward": 1.4950000256299973, |
|
"reward_std": 0.4526091780513525, |
|
"rewards/accuracy_reward": 0.5925000101327896, |
|
"rewards/format_reward": 0.9025000095367431, |
|
"step": 420 |
|
}, |
|
{ |
|
"completion_length": 288.7437553405762, |
|
"epoch": 1.1466666666666667, |
|
"grad_norm": 0.10607220977544785, |
|
"kl": 0.16053466796875, |
|
"learning_rate": 2.330449282985219e-06, |
|
"loss": 0.0064, |
|
"reward": 1.590625023841858, |
|
"reward_std": 0.3875279016792774, |
|
"rewards/accuracy_reward": 0.6481250151991844, |
|
"rewards/format_reward": 0.9425000056624413, |
|
"step": 430 |
|
}, |
|
{ |
|
"completion_length": 359.6525058746338, |
|
"epoch": 1.1733333333333333, |
|
"grad_norm": 0.08999033272266388, |
|
"kl": 0.16671142578125, |
|
"learning_rate": 2.2912777938827377e-06, |
|
"loss": 0.0067, |
|
"reward": 1.5606250256299972, |
|
"reward_std": 0.42853499911725523, |
|
"rewards/accuracy_reward": 0.647500005364418, |
|
"rewards/format_reward": 0.9131250083446503, |
|
"step": 440 |
|
}, |
|
{ |
|
"completion_length": 400.6356315612793, |
|
"epoch": 1.2, |
|
"grad_norm": 0.09623633325099945, |
|
"kl": 0.1906982421875, |
|
"learning_rate": 2.251343817117798e-06, |
|
"loss": 0.0076, |
|
"reward": 1.514375028014183, |
|
"reward_std": 0.49012119248509406, |
|
"rewards/accuracy_reward": 0.6412500105798244, |
|
"rewards/format_reward": 0.8731250122189522, |
|
"step": 450 |
|
}, |
|
{ |
|
"completion_length": 363.6906307220459, |
|
"epoch": 1.2266666666666666, |
|
"grad_norm": 0.09504391998052597, |
|
"kl": 0.1889892578125, |
|
"learning_rate": 2.2106858336952155e-06, |
|
"loss": 0.0076, |
|
"reward": 1.4875000238418579, |
|
"reward_std": 0.43876917734742166, |
|
"rewards/accuracy_reward": 0.5943750120699406, |
|
"rewards/format_reward": 0.8931250110268593, |
|
"step": 460 |
|
}, |
|
{ |
|
"completion_length": 295.2150051116943, |
|
"epoch": 1.2533333333333334, |
|
"grad_norm": 0.14019078016281128, |
|
"kl": 0.18583984375, |
|
"learning_rate": 2.169343022283947e-06, |
|
"loss": 0.0074, |
|
"reward": 1.5243750274181367, |
|
"reward_std": 0.41497170850634574, |
|
"rewards/accuracy_reward": 0.5993750095367432, |
|
"rewards/format_reward": 0.9250000089406967, |
|
"step": 470 |
|
}, |
|
{ |
|
"completion_length": 295.10875549316404, |
|
"epoch": 1.28, |
|
"grad_norm": 0.18233150243759155, |
|
"kl": 0.1969482421875, |
|
"learning_rate": 2.127355221463915e-06, |
|
"loss": 0.0079, |
|
"reward": 1.4793750196695328, |
|
"reward_std": 0.409528423845768, |
|
"rewards/accuracy_reward": 0.5656250078231096, |
|
"rewards/format_reward": 0.9137500107288361, |
|
"step": 480 |
|
}, |
|
{ |
|
"completion_length": 321.9537551879883, |
|
"epoch": 1.3066666666666666, |
|
"grad_norm": 0.11022540926933289, |
|
"kl": 0.22891845703125, |
|
"learning_rate": 2.084762891336928e-06, |
|
"loss": 0.0092, |
|
"reward": 1.4631250321865081, |
|
"reward_std": 0.47595020812004807, |
|
"rewards/accuracy_reward": 0.5818750143051148, |
|
"rewards/format_reward": 0.8812500149011612, |
|
"step": 490 |
|
}, |
|
{ |
|
"completion_length": 329.26063079833983, |
|
"epoch": 1.3333333333333333, |
|
"grad_norm": 0.10155805945396423, |
|
"kl": 0.20677490234375, |
|
"learning_rate": 2.041607074538693e-06, |
|
"loss": 0.0083, |
|
"reward": 1.491875022649765, |
|
"reward_std": 0.45835329182446005, |
|
"rewards/accuracy_reward": 0.5962500102818012, |
|
"rewards/format_reward": 0.8956250146031379, |
|
"step": 500 |
|
}, |
|
{ |
|
"completion_length": 335.51375617980955, |
|
"epoch": 1.3599999999999999, |
|
"grad_norm": 0.11989594250917435, |
|
"kl": 0.18310546875, |
|
"learning_rate": 1.9979293566894888e-06, |
|
"loss": 0.0073, |
|
"reward": 1.529375022649765, |
|
"reward_std": 0.4263172609731555, |
|
"rewards/accuracy_reward": 0.6125000081956387, |
|
"rewards/format_reward": 0.9168750137090683, |
|
"step": 510 |
|
}, |
|
{ |
|
"completion_length": 348.62563133239746, |
|
"epoch": 1.3866666666666667, |
|
"grad_norm": 0.13145174086093903, |
|
"kl": 0.18135986328125, |
|
"learning_rate": 1.9537718263216137e-06, |
|
"loss": 0.0073, |
|
"reward": 1.4893750190734862, |
|
"reward_std": 0.42451257910579443, |
|
"rewards/accuracy_reward": 0.5843750070780516, |
|
"rewards/format_reward": 0.9050000116229058, |
|
"step": 520 |
|
}, |
|
{ |
|
"completion_length": 343.5318801879883, |
|
"epoch": 1.4133333333333333, |
|
"grad_norm": 0.11078327149152756, |
|
"kl": 0.19310302734375, |
|
"learning_rate": 1.909177034322215e-06, |
|
"loss": 0.0077, |
|
"reward": 1.5475000232458114, |
|
"reward_std": 0.45979407913982867, |
|
"rewards/accuracy_reward": 0.6500000119209289, |
|
"rewards/format_reward": 0.8975000113248826, |
|
"step": 530 |
|
}, |
|
{ |
|
"completion_length": 316.70063095092775, |
|
"epoch": 1.44, |
|
"grad_norm": 0.13980206847190857, |
|
"kl": 0.1595458984375, |
|
"learning_rate": 1.8641879529305908e-06, |
|
"loss": 0.0064, |
|
"reward": 1.5656250238418579, |
|
"reward_std": 0.3269194783642888, |
|
"rewards/accuracy_reward": 0.6243750140070915, |
|
"rewards/format_reward": 0.9412500038743019, |
|
"step": 540 |
|
}, |
|
{ |
|
"completion_length": 345.5168800354004, |
|
"epoch": 1.4666666666666668, |
|
"grad_norm": 0.12923210859298706, |
|
"kl": 0.1690673828125, |
|
"learning_rate": 1.818847934329465e-06, |
|
"loss": 0.0068, |
|
"reward": 1.4968750149011611, |
|
"reward_std": 0.4168443286791444, |
|
"rewards/accuracy_reward": 0.584375013411045, |
|
"rewards/format_reward": 0.912500013411045, |
|
"step": 550 |
|
}, |
|
{ |
|
"completion_length": 353.0875053405762, |
|
"epoch": 1.4933333333333334, |
|
"grad_norm": 0.1003999263048172, |
|
"kl": 0.20703125, |
|
"learning_rate": 1.7732006688701488e-06, |
|
"loss": 0.0083, |
|
"reward": 1.458750021457672, |
|
"reward_std": 0.4720118813216686, |
|
"rewards/accuracy_reward": 0.5718750119209289, |
|
"rewards/format_reward": 0.8868750110268593, |
|
"step": 560 |
|
}, |
|
{ |
|
"completion_length": 312.12188186645506, |
|
"epoch": 1.52, |
|
"grad_norm": 0.16526304185390472, |
|
"kl": 0.20828857421875, |
|
"learning_rate": 1.727290142971832e-06, |
|
"loss": 0.0083, |
|
"reward": 1.475625030696392, |
|
"reward_std": 0.4474962681531906, |
|
"rewards/accuracy_reward": 0.5737500060349703, |
|
"rewards/format_reward": 0.901875014603138, |
|
"step": 570 |
|
}, |
|
{ |
|
"completion_length": 318.03625679016113, |
|
"epoch": 1.5466666666666666, |
|
"grad_norm": 0.121968574821949, |
|
"kl": 0.17958984375, |
|
"learning_rate": 1.6811605967355838e-06, |
|
"loss": 0.0072, |
|
"reward": 1.5162500262260437, |
|
"reward_std": 0.3832638839259744, |
|
"rewards/accuracy_reward": 0.5818750113248825, |
|
"rewards/format_reward": 0.9343750089406967, |
|
"step": 580 |
|
}, |
|
{ |
|
"completion_length": 387.7318817138672, |
|
"epoch": 1.5733333333333333, |
|
"grad_norm": 0.11350403726100922, |
|
"kl": 0.19808349609375, |
|
"learning_rate": 1.6348564813138958e-06, |
|
"loss": 0.0079, |
|
"reward": 1.5393750190734863, |
|
"reward_std": 0.43136950451880696, |
|
"rewards/accuracy_reward": 0.6400000102818012, |
|
"rewards/format_reward": 0.8993750125169754, |
|
"step": 590 |
|
}, |
|
{ |
|
"completion_length": 416.05750732421876, |
|
"epoch": 1.6, |
|
"grad_norm": 0.17338570952415466, |
|
"kl": 0.23121337890625, |
|
"learning_rate": 1.588422416076859e-06, |
|
"loss": 0.0092, |
|
"reward": 1.4106250256299973, |
|
"reward_std": 0.5426479373127222, |
|
"rewards/accuracy_reward": 0.5600000061094761, |
|
"rewards/format_reward": 0.8506250083446503, |
|
"step": 600 |
|
}, |
|
{ |
|
"completion_length": 366.97062797546386, |
|
"epoch": 1.6266666666666667, |
|
"grad_norm": 0.5024117231369019, |
|
"kl": 0.2103515625, |
|
"learning_rate": 1.5419031456162405e-06, |
|
"loss": 0.0084, |
|
"reward": 1.4693750351667405, |
|
"reward_std": 0.4877826740965247, |
|
"rewards/accuracy_reward": 0.588750010728836, |
|
"rewards/format_reward": 0.8806250095367432, |
|
"step": 610 |
|
}, |
|
{ |
|
"completion_length": 346.7418815612793, |
|
"epoch": 1.6533333333333333, |
|
"grad_norm": 0.13114424049854279, |
|
"kl": 0.25361328125, |
|
"learning_rate": 1.4953434966288927e-06, |
|
"loss": 0.0101, |
|
"reward": 1.459375011920929, |
|
"reward_std": 0.5048464283347129, |
|
"rewards/accuracy_reward": 0.5925000093877315, |
|
"rewards/format_reward": 0.8668750107288361, |
|
"step": 620 |
|
}, |
|
{ |
|
"completion_length": 310.78437995910645, |
|
"epoch": 1.6800000000000002, |
|
"grad_norm": 0.13125382363796234, |
|
"kl": 0.2142822265625, |
|
"learning_rate": 1.4487883347210483e-06, |
|
"loss": 0.0086, |
|
"reward": 1.4768750280141831, |
|
"reward_std": 0.4538666373118758, |
|
"rewards/accuracy_reward": 0.5725000105798245, |
|
"rewards/format_reward": 0.9043750137090683, |
|
"step": 630 |
|
}, |
|
{ |
|
"completion_length": 267.90562973022463, |
|
"epoch": 1.7066666666666666, |
|
"grad_norm": 0.1306937038898468, |
|
"kl": 0.1768310546875, |
|
"learning_rate": 1.4022825211751206e-06, |
|
"loss": 0.0071, |
|
"reward": 1.5306250214576722, |
|
"reward_std": 0.3767278905957937, |
|
"rewards/accuracy_reward": 0.5837500121444463, |
|
"rewards/format_reward": 0.9468750119209289, |
|
"step": 640 |
|
}, |
|
{ |
|
"completion_length": 300.6637557983398, |
|
"epoch": 1.7333333333333334, |
|
"grad_norm": 0.1377246230840683, |
|
"kl": 0.16981201171875, |
|
"learning_rate": 1.355870869720669e-06, |
|
"loss": 0.0068, |
|
"reward": 1.5968750298023224, |
|
"reward_std": 0.3849057173356414, |
|
"rewards/accuracy_reward": 0.6581250108778477, |
|
"rewards/format_reward": 0.938750010728836, |
|
"step": 650 |
|
}, |
|
{ |
|
"completion_length": 308.26562995910643, |
|
"epoch": 1.76, |
|
"grad_norm": 0.1021685004234314, |
|
"kl": 0.1703125, |
|
"learning_rate": 1.3095981033511883e-06, |
|
"loss": 0.0068, |
|
"reward": 1.4662500232458116, |
|
"reward_std": 0.3969815358519554, |
|
"rewards/accuracy_reward": 0.537500013411045, |
|
"rewards/format_reward": 0.9287500098347664, |
|
"step": 660 |
|
}, |
|
{ |
|
"completion_length": 326.6156311035156, |
|
"epoch": 1.7866666666666666, |
|
"grad_norm": 0.15498338639736176, |
|
"kl": 0.192919921875, |
|
"learning_rate": 1.2635088112283316e-06, |
|
"loss": 0.0077, |
|
"reward": 1.4600000232458115, |
|
"reward_std": 0.4722231462597847, |
|
"rewards/accuracy_reward": 0.5543750062584877, |
|
"rewards/format_reward": 0.9056250154972076, |
|
"step": 670 |
|
}, |
|
{ |
|
"completion_length": 318.85250625610354, |
|
"epoch": 1.8133333333333335, |
|
"grad_norm": 0.11492400616407394, |
|
"kl": 0.18997802734375, |
|
"learning_rate": 1.217647405715099e-06, |
|
"loss": 0.0076, |
|
"reward": 1.5250000268220902, |
|
"reward_std": 0.4434811886399984, |
|
"rewards/accuracy_reward": 0.6143750160932541, |
|
"rewards/format_reward": 0.9106250107288361, |
|
"step": 680 |
|
}, |
|
{ |
|
"completion_length": 297.7112564086914, |
|
"epoch": 1.8399999999999999, |
|
"grad_norm": 0.10103321820497513, |
|
"kl": 0.208349609375, |
|
"learning_rate": 1.1720580795793865e-06, |
|
"loss": 0.0083, |
|
"reward": 1.4731250196695327, |
|
"reward_std": 0.4129851894453168, |
|
"rewards/accuracy_reward": 0.5556250110268592, |
|
"rewards/format_reward": 0.917500016093254, |
|
"step": 690 |
|
}, |
|
{ |
|
"completion_length": 283.28375282287595, |
|
"epoch": 1.8666666666666667, |
|
"grad_norm": 0.15206778049468994, |
|
"kl": 0.18507080078125, |
|
"learning_rate": 1.1267847634091462e-06, |
|
"loss": 0.0074, |
|
"reward": 1.5437500149011611, |
|
"reward_std": 0.3982010118663311, |
|
"rewards/accuracy_reward": 0.6137500122189522, |
|
"rewards/format_reward": 0.9300000086426735, |
|
"step": 700 |
|
}, |
|
{ |
|
"completion_length": 295.87500762939453, |
|
"epoch": 1.8933333333333333, |
|
"grad_norm": 0.09867344796657562, |
|
"kl": 0.16317138671875, |
|
"learning_rate": 1.0818710832801818e-06, |
|
"loss": 0.0065, |
|
"reward": 1.5937500238418578, |
|
"reward_std": 0.3769227135926485, |
|
"rewards/accuracy_reward": 0.6562500093132257, |
|
"rewards/format_reward": 0.9375000089406967, |
|
"step": 710 |
|
}, |
|
{ |
|
"completion_length": 309.8425048828125, |
|
"epoch": 1.92, |
|
"grad_norm": 0.11957939714193344, |
|
"kl": 0.14814453125, |
|
"learning_rate": 1.0373603187173825e-06, |
|
"loss": 0.0059, |
|
"reward": 1.5743750184774399, |
|
"reward_std": 0.34016798436641693, |
|
"rewards/accuracy_reward": 0.6218750074505806, |
|
"rewards/format_reward": 0.9525000095367432, |
|
"step": 720 |
|
}, |
|
{ |
|
"completion_length": 338.95750427246094, |
|
"epoch": 1.9466666666666668, |
|
"grad_norm": 0.12207633256912231, |
|
"kl": 0.1755126953125, |
|
"learning_rate": 9.932953609898924e-07, |
|
"loss": 0.007, |
|
"reward": 1.5612500250339507, |
|
"reward_std": 0.40149390175938604, |
|
"rewards/accuracy_reward": 0.6425000101327896, |
|
"rewards/format_reward": 0.9187500134110451, |
|
"step": 730 |
|
}, |
|
{ |
|
"completion_length": 344.77625541687013, |
|
"epoch": 1.9733333333333334, |
|
"grad_norm": 0.1330130398273468, |
|
"kl": 0.18267822265625, |
|
"learning_rate": 9.497186717804155e-07, |
|
"loss": 0.0073, |
|
"reward": 1.5181250244379043, |
|
"reward_std": 0.3998035121709108, |
|
"rewards/accuracy_reward": 0.6037500113248825, |
|
"rewards/format_reward": 0.9143750131130218, |
|
"step": 740 |
|
}, |
|
{ |
|
"completion_length": 338.76000556945803, |
|
"epoch": 2.0, |
|
"grad_norm": 0.1740158349275589, |
|
"kl": 0.233056640625, |
|
"learning_rate": 9.066722422684706e-07, |
|
"loss": 0.0093, |
|
"reward": 1.5250000178813934, |
|
"reward_std": 0.42267096769064666, |
|
"rewards/accuracy_reward": 0.607500009983778, |
|
"rewards/format_reward": 0.9175000071525574, |
|
"step": 750 |
|
}, |
|
{ |
|
"completion_length": 313.39938125610354, |
|
"epoch": 2.026666666666667, |
|
"grad_norm": 0.17406630516052246, |
|
"kl": 0.16112060546875, |
|
"learning_rate": 8.641975526670375e-07, |
|
"loss": 0.0064, |
|
"reward": 1.5193750202655791, |
|
"reward_std": 0.38527730852365494, |
|
"rewards/accuracy_reward": 0.5837500056251883, |
|
"rewards/format_reward": 0.9356250122189522, |
|
"step": 760 |
|
}, |
|
{ |
|
"completion_length": 335.1118816375732, |
|
"epoch": 2.0533333333333332, |
|
"grad_norm": 0.11110047250986099, |
|
"kl": 0.169952392578125, |
|
"learning_rate": 8.223355322515711e-07, |
|
"loss": 0.0068, |
|
"reward": 1.5487500220537185, |
|
"reward_std": 0.4174341483041644, |
|
"rewards/accuracy_reward": 0.6300000164657831, |
|
"rewards/format_reward": 0.9187500104308128, |
|
"step": 770 |
|
}, |
|
{ |
|
"completion_length": 329.0106330871582, |
|
"epoch": 2.08, |
|
"grad_norm": 0.12786993384361267, |
|
"kl": 0.1651123046875, |
|
"learning_rate": 7.811265199199153e-07, |
|
"loss": 0.0066, |
|
"reward": 1.4956250309944152, |
|
"reward_std": 0.39866077806800604, |
|
"rewards/accuracy_reward": 0.5656250093132258, |
|
"rewards/format_reward": 0.9300000071525574, |
|
"step": 780 |
|
}, |
|
{ |
|
"completion_length": 317.1981311798096, |
|
"epoch": 2.1066666666666665, |
|
"grad_norm": 0.15374892950057983, |
|
"kl": 0.16614990234375, |
|
"learning_rate": 7.406102253211037e-07, |
|
"loss": 0.0066, |
|
"reward": 1.554375022649765, |
|
"reward_std": 0.38757612481713294, |
|
"rewards/accuracy_reward": 0.6156250141561032, |
|
"rewards/format_reward": 0.93875000923872, |
|
"step": 790 |
|
}, |
|
{ |
|
"completion_length": 356.8893817901611, |
|
"epoch": 2.1333333333333333, |
|
"grad_norm": 0.09933959692716599, |
|
"kl": 0.18651123046875, |
|
"learning_rate": 7.008256905905285e-07, |
|
"loss": 0.0075, |
|
"reward": 1.4968750149011611, |
|
"reward_std": 0.434663244150579, |
|
"rewards/accuracy_reward": 0.5887500114738942, |
|
"rewards/format_reward": 0.9081250086426735, |
|
"step": 800 |
|
}, |
|
{ |
|
"completion_length": 334.13250694274905, |
|
"epoch": 2.16, |
|
"grad_norm": 0.10136093944311142, |
|
"kl": 0.18509521484375, |
|
"learning_rate": 6.618112527283208e-07, |
|
"loss": 0.0074, |
|
"reward": 1.5343750268220901, |
|
"reward_std": 0.41338230539113285, |
|
"rewards/accuracy_reward": 0.6100000128149986, |
|
"rewards/format_reward": 0.9243750125169754, |
|
"step": 810 |
|
}, |
|
{ |
|
"completion_length": 331.4456298828125, |
|
"epoch": 2.1866666666666665, |
|
"grad_norm": 0.10766751319169998, |
|
"kl": 0.17252197265625, |
|
"learning_rate": 6.236045066572228e-07, |
|
"loss": 0.0069, |
|
"reward": 1.5537500262260437, |
|
"reward_std": 0.3903699716553092, |
|
"rewards/accuracy_reward": 0.631250013038516, |
|
"rewards/format_reward": 0.9225000113248825, |
|
"step": 820 |
|
}, |
|
{ |
|
"completion_length": 299.4775054931641, |
|
"epoch": 2.2133333333333334, |
|
"grad_norm": 0.09992185235023499, |
|
"kl": 0.1746337890625, |
|
"learning_rate": 5.862422689955269e-07, |
|
"loss": 0.007, |
|
"reward": 1.5581250220537186, |
|
"reward_std": 0.35976272616535426, |
|
"rewards/accuracy_reward": 0.6206250134855509, |
|
"rewards/format_reward": 0.9375000104308129, |
|
"step": 830 |
|
}, |
|
{ |
|
"completion_length": 323.91625518798827, |
|
"epoch": 2.24, |
|
"grad_norm": 0.07573343813419342, |
|
"kl": 0.167333984375, |
|
"learning_rate": 5.497605425800119e-07, |
|
"loss": 0.0067, |
|
"reward": 1.5562500298023223, |
|
"reward_std": 0.3865811740979552, |
|
"rewards/accuracy_reward": 0.6243750125169754, |
|
"rewards/format_reward": 0.9318750128149986, |
|
"step": 840 |
|
}, |
|
{ |
|
"completion_length": 317.47250442504884, |
|
"epoch": 2.2666666666666666, |
|
"grad_norm": 0.11399682611227036, |
|
"kl": 0.16575927734375, |
|
"learning_rate": 5.141944817730411e-07, |
|
"loss": 0.0066, |
|
"reward": 1.5550000220537186, |
|
"reward_std": 0.3845184024423361, |
|
"rewards/accuracy_reward": 0.6225000105798244, |
|
"rewards/format_reward": 0.9325000122189522, |
|
"step": 850 |
|
}, |
|
{ |
|
"completion_length": 309.3156311035156, |
|
"epoch": 2.2933333333333334, |
|
"grad_norm": 0.16207897663116455, |
|
"kl": 0.171240234375, |
|
"learning_rate": 4.795783585872737e-07, |
|
"loss": 0.0068, |
|
"reward": 1.5768750220537187, |
|
"reward_std": 0.35651344805955887, |
|
"rewards/accuracy_reward": 0.6406250078231096, |
|
"rewards/format_reward": 0.9362500071525574, |
|
"step": 860 |
|
}, |
|
{ |
|
"completion_length": 327.7281307220459, |
|
"epoch": 2.32, |
|
"grad_norm": 0.14628101885318756, |
|
"kl": 0.20667724609375, |
|
"learning_rate": 4.4594552966061055e-07, |
|
"loss": 0.0083, |
|
"reward": 1.5000000178813935, |
|
"reward_std": 0.4507721956819296, |
|
"rewards/accuracy_reward": 0.5931250065565109, |
|
"rewards/format_reward": 0.9068750143051147, |
|
"step": 870 |
|
}, |
|
{ |
|
"completion_length": 349.7168792724609, |
|
"epoch": 2.3466666666666667, |
|
"grad_norm": 0.12691651284694672, |
|
"kl": 0.19437255859375, |
|
"learning_rate": 4.1332840411322373e-07, |
|
"loss": 0.0078, |
|
"reward": 1.5350000202655791, |
|
"reward_std": 0.44358963407576085, |
|
"rewards/accuracy_reward": 0.6350000135600566, |
|
"rewards/format_reward": 0.9000000104308128, |
|
"step": 880 |
|
}, |
|
{ |
|
"completion_length": 324.5581310272217, |
|
"epoch": 2.3733333333333335, |
|
"grad_norm": 0.17686228454113007, |
|
"kl": 0.18092041015625, |
|
"learning_rate": 3.817584123176149e-07, |
|
"loss": 0.0072, |
|
"reward": 1.564375028014183, |
|
"reward_std": 0.4211408071219921, |
|
"rewards/accuracy_reward": 0.6462500020861626, |
|
"rewards/format_reward": 0.9181250125169754, |
|
"step": 890 |
|
}, |
|
{ |
|
"completion_length": 320.24062728881836, |
|
"epoch": 2.4, |
|
"grad_norm": 0.12916938960552216, |
|
"kl": 0.162109375, |
|
"learning_rate": 3.5126597561182106e-07, |
|
"loss": 0.0065, |
|
"reward": 1.6000000268220902, |
|
"reward_std": 0.3978784864768386, |
|
"rewards/accuracy_reward": 0.6662500157952309, |
|
"rewards/format_reward": 0.9337500125169754, |
|
"step": 900 |
|
}, |
|
{ |
|
"completion_length": 329.11000671386716, |
|
"epoch": 2.4266666666666667, |
|
"grad_norm": 0.14607831835746765, |
|
"kl": 0.2010498046875, |
|
"learning_rate": 3.2188047698493277e-07, |
|
"loss": 0.008, |
|
"reward": 1.5743750303983688, |
|
"reward_std": 0.39661835934966805, |
|
"rewards/accuracy_reward": 0.641875009983778, |
|
"rewards/format_reward": 0.9325000122189522, |
|
"step": 910 |
|
}, |
|
{ |
|
"completion_length": 324.65750617980956, |
|
"epoch": 2.453333333333333, |
|
"grad_norm": 0.1194528341293335, |
|
"kl": 0.170458984375, |
|
"learning_rate": 2.9363023276319157e-07, |
|
"loss": 0.0068, |
|
"reward": 1.529375022649765, |
|
"reward_std": 0.3923725115135312, |
|
"rewards/accuracy_reward": 0.6093750111758709, |
|
"rewards/format_reward": 0.9200000137090683, |
|
"step": 920 |
|
}, |
|
{ |
|
"completion_length": 338.57312965393066, |
|
"epoch": 2.48, |
|
"grad_norm": 0.11762479692697525, |
|
"kl": 0.1827392578125, |
|
"learning_rate": 2.6654246532392954e-07, |
|
"loss": 0.0073, |
|
"reward": 1.5650000244379043, |
|
"reward_std": 0.46239927411079407, |
|
"rewards/accuracy_reward": 0.6618750102818012, |
|
"rewards/format_reward": 0.903125011920929, |
|
"step": 930 |
|
}, |
|
{ |
|
"completion_length": 309.353129196167, |
|
"epoch": 2.506666666666667, |
|
"grad_norm": 0.13050219416618347, |
|
"kl": 0.17652587890625, |
|
"learning_rate": 2.406432768636658e-07, |
|
"loss": 0.0071, |
|
"reward": 1.5493750274181366, |
|
"reward_std": 0.4042684996500611, |
|
"rewards/accuracy_reward": 0.6293750092387199, |
|
"rewards/format_reward": 0.9200000166893005, |
|
"step": 940 |
|
}, |
|
{ |
|
"completion_length": 318.59750709533694, |
|
"epoch": 2.533333333333333, |
|
"grad_norm": 0.10761768370866776, |
|
"kl": 0.17911376953125, |
|
"learning_rate": 2.1595762424561588e-07, |
|
"loss": 0.0072, |
|
"reward": 1.5812500208616256, |
|
"reward_std": 0.39062286671251056, |
|
"rewards/accuracy_reward": 0.6731250114738941, |
|
"rewards/format_reward": 0.9081250116229057, |
|
"step": 950 |
|
}, |
|
{ |
|
"completion_length": 320.97687950134275, |
|
"epoch": 2.56, |
|
"grad_norm": 0.11984766274690628, |
|
"kl": 0.174462890625, |
|
"learning_rate": 1.9250929495087294e-07, |
|
"loss": 0.007, |
|
"reward": 1.513750022649765, |
|
"reward_std": 0.4024242129176855, |
|
"rewards/accuracy_reward": 0.5925000108778476, |
|
"rewards/format_reward": 0.921250008046627, |
|
"step": 960 |
|
}, |
|
{ |
|
"completion_length": 318.3468811035156, |
|
"epoch": 2.586666666666667, |
|
"grad_norm": 0.0903179794549942, |
|
"kl": 0.1794921875, |
|
"learning_rate": 1.703208841564171e-07, |
|
"loss": 0.0072, |
|
"reward": 1.5906250119209289, |
|
"reward_std": 0.40222617890685797, |
|
"rewards/accuracy_reward": 0.6681250132620334, |
|
"rewards/format_reward": 0.9225000128149986, |
|
"step": 970 |
|
}, |
|
{ |
|
"completion_length": 302.10000534057616, |
|
"epoch": 2.6133333333333333, |
|
"grad_norm": 0.1316317617893219, |
|
"kl": 0.1739013671875, |
|
"learning_rate": 1.4941377296204656e-07, |
|
"loss": 0.007, |
|
"reward": 1.5762500196695328, |
|
"reward_std": 0.3883319929242134, |
|
"rewards/accuracy_reward": 0.6468750081956387, |
|
"rewards/format_reward": 0.9293750151991844, |
|
"step": 980 |
|
}, |
|
{ |
|
"completion_length": 319.00313262939454, |
|
"epoch": 2.64, |
|
"grad_norm": 0.1167697086930275, |
|
"kl": 0.17550048828125, |
|
"learning_rate": 1.2980810778722047e-07, |
|
"loss": 0.007, |
|
"reward": 1.5487500220537185, |
|
"reward_std": 0.41231051571667193, |
|
"rewards/accuracy_reward": 0.6293750144541264, |
|
"rewards/format_reward": 0.9193750113248825, |
|
"step": 990 |
|
}, |
|
{ |
|
"completion_length": 302.03875694274905, |
|
"epoch": 2.6666666666666665, |
|
"grad_norm": 0.11268429458141327, |
|
"kl": 0.1760498046875, |
|
"learning_rate": 1.1152278095764917e-07, |
|
"loss": 0.007, |
|
"reward": 1.5656250268220901, |
|
"reward_std": 0.3788869069889188, |
|
"rewards/accuracy_reward": 0.6337500095367432, |
|
"rewards/format_reward": 0.9318750128149986, |
|
"step": 1000 |
|
}, |
|
{ |
|
"completion_length": 314.637504196167, |
|
"epoch": 2.6933333333333334, |
|
"grad_norm": 0.09960366785526276, |
|
"kl": 0.1811279296875, |
|
"learning_rate": 9.457541250035762e-08, |
|
"loss": 0.0072, |
|
"reward": 1.5493750303983689, |
|
"reward_std": 0.407411840185523, |
|
"rewards/accuracy_reward": 0.6312500149011612, |
|
"rewards/format_reward": 0.9181250125169754, |
|
"step": 1010 |
|
}, |
|
{ |
|
"completion_length": 313.1293800354004, |
|
"epoch": 2.7199999999999998, |
|
"grad_norm": 0.11189663410186768, |
|
"kl": 0.1806884765625, |
|
"learning_rate": 7.898233316474724e-08, |
|
"loss": 0.0072, |
|
"reward": 1.5737500235438346, |
|
"reward_std": 0.38581568617373707, |
|
"rewards/accuracy_reward": 0.6493750059977174, |
|
"rewards/format_reward": 0.9243750110268593, |
|
"step": 1020 |
|
}, |
|
{ |
|
"completion_length": 294.27250480651855, |
|
"epoch": 2.7466666666666666, |
|
"grad_norm": 0.14609037339687347, |
|
"kl": 0.18486328125, |
|
"learning_rate": 6.475856868603475e-08, |
|
"loss": 0.0074, |
|
"reward": 1.6087500244379043, |
|
"reward_std": 0.38516267221421, |
|
"rewards/accuracy_reward": 0.6806250080466271, |
|
"rewards/format_reward": 0.9281250074505806, |
|
"step": 1030 |
|
}, |
|
{ |
|
"completion_length": 321.70563011169435, |
|
"epoch": 2.7733333333333334, |
|
"grad_norm": 0.10615142434835434, |
|
"kl": 0.17933349609375, |
|
"learning_rate": 5.191782530621553e-08, |
|
"loss": 0.0072, |
|
"reward": 1.5543750256299973, |
|
"reward_std": 0.39196000918745993, |
|
"rewards/accuracy_reward": 0.6375000063329935, |
|
"rewards/format_reward": 0.9168750092387199, |
|
"step": 1040 |
|
}, |
|
{ |
|
"completion_length": 329.3381317138672, |
|
"epoch": 2.8, |
|
"grad_norm": 0.10510344058275223, |
|
"kl": 0.17608642578125, |
|
"learning_rate": 4.0472476566516036e-08, |
|
"loss": 0.007, |
|
"reward": 1.5681250244379044, |
|
"reward_std": 0.43267819974571464, |
|
"rewards/accuracy_reward": 0.6543750088661909, |
|
"rewards/format_reward": 0.9137500122189521, |
|
"step": 1050 |
|
}, |
|
{ |
|
"completion_length": 322.91062889099123, |
|
"epoch": 2.8266666666666667, |
|
"grad_norm": 0.10127785056829453, |
|
"kl": 0.168310546875, |
|
"learning_rate": 3.043355138405418e-08, |
|
"loss": 0.0067, |
|
"reward": 1.5443750321865082, |
|
"reward_std": 0.3810180738568306, |
|
"rewards/accuracy_reward": 0.6168750144541264, |
|
"rewards/format_reward": 0.9275000095367432, |
|
"step": 1060 |
|
}, |
|
{ |
|
"completion_length": 327.8443794250488, |
|
"epoch": 2.8533333333333335, |
|
"grad_norm": 0.150223970413208, |
|
"kl": 0.1885986328125, |
|
"learning_rate": 2.1810723424204705e-08, |
|
"loss": 0.0075, |
|
"reward": 1.5512500286102295, |
|
"reward_std": 0.4414610244333744, |
|
"rewards/accuracy_reward": 0.6412500124424696, |
|
"rewards/format_reward": 0.9100000098347664, |
|
"step": 1070 |
|
}, |
|
{ |
|
"completion_length": 321.11000747680663, |
|
"epoch": 2.88, |
|
"grad_norm": 0.1418101191520691, |
|
"kl": 0.19056396484375, |
|
"learning_rate": 1.4612301778901604e-08, |
|
"loss": 0.0076, |
|
"reward": 1.511250025033951, |
|
"reward_std": 0.43174309805035593, |
|
"rewards/accuracy_reward": 0.5987500116229058, |
|
"rewards/format_reward": 0.912500011920929, |
|
"step": 1080 |
|
}, |
|
{ |
|
"completion_length": 326.74563217163086, |
|
"epoch": 2.9066666666666667, |
|
"grad_norm": 0.12356416881084442, |
|
"kl": 0.19766845703125, |
|
"learning_rate": 8.845222959868227e-09, |
|
"loss": 0.0079, |
|
"reward": 1.5175000235438347, |
|
"reward_std": 0.394646280631423, |
|
"rewards/accuracy_reward": 0.6131250087171793, |
|
"rewards/format_reward": 0.9043750107288361, |
|
"step": 1090 |
|
}, |
|
{ |
|
"completion_length": 337.4362560272217, |
|
"epoch": 2.9333333333333336, |
|
"grad_norm": 0.16877064108848572, |
|
"kl": 0.18851318359375, |
|
"learning_rate": 4.515044214485842e-09, |
|
"loss": 0.0075, |
|
"reward": 1.5406250208616257, |
|
"reward_std": 0.4307627685368061, |
|
"rewards/accuracy_reward": 0.630625007674098, |
|
"rewards/format_reward": 0.9100000083446502, |
|
"step": 1100 |
|
}, |
|
{ |
|
"completion_length": 334.74000549316406, |
|
"epoch": 2.96, |
|
"grad_norm": 0.14080122113227844, |
|
"kl": 0.1871826171875, |
|
"learning_rate": 1.6259381707432464e-09, |
|
"loss": 0.0075, |
|
"reward": 1.5362500309944154, |
|
"reward_std": 0.4239814583212137, |
|
"rewards/accuracy_reward": 0.6300000049173832, |
|
"rewards/format_reward": 0.9062500104308129, |
|
"step": 1110 |
|
}, |
|
{ |
|
"completion_length": 341.06625747680664, |
|
"epoch": 2.986666666666667, |
|
"grad_norm": 0.09707628190517426, |
|
"kl": 0.1976806640625, |
|
"learning_rate": 1.8068881642691049e-10, |
|
"loss": 0.0079, |
|
"reward": 1.54687502682209, |
|
"reward_std": 0.46240887157619, |
|
"rewards/accuracy_reward": 0.6443750120699405, |
|
"rewards/format_reward": 0.9025000169873237, |
|
"step": 1120 |
|
}, |
|
{ |
|
"completion_length": 330.91750717163086, |
|
"epoch": 3.0, |
|
"kl": 0.17158203125, |
|
"reward": 1.5462500274181366, |
|
"reward_std": 0.43328754380345347, |
|
"rewards/accuracy_reward": 0.6225000083446502, |
|
"rewards/format_reward": 0.9237500160932541, |
|
"step": 1125, |
|
"total_flos": 0.0, |
|
"train_loss": 0.08514851592410155, |
|
"train_runtime": 97826.7372, |
|
"train_samples_per_second": 0.23, |
|
"train_steps_per_second": 0.011 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 1125, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|