|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"episode": 7680, |
|
"epoch": 0.10533967931748667, |
|
"eval_steps": 200.0, |
|
"global_step": 150, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"episode": 256, |
|
"epoch": 0.003511322643916222, |
|
"eps": 6, |
|
"loss/policy_avg": -0.07090990990400314, |
|
"loss/value_avg": 0.0, |
|
"lr": 3e-06, |
|
"objective/entropy": 49.42120361328125, |
|
"objective/kl": 0.006465356796979904, |
|
"objective/non_score_reward": -0.000646535714622587, |
|
"objective/rlhf_reward": -1.1137903928756714, |
|
"objective/scores": -1.109375, |
|
"policy/approxkl_avg": 27.096786499023438, |
|
"policy/clipfrac_avg": 0.732421875, |
|
"policy/entropy_avg": 0.92181396484375, |
|
"step": 5, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 12, |
|
"val/ratio": 1.0399832725524902, |
|
"val/ratio_var": 0.010045886039733887 |
|
}, |
|
{ |
|
"episode": 512, |
|
"epoch": 0.007022645287832444, |
|
"eps": 6, |
|
"loss/policy_avg": -0.06497187167406082, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.9923273657289e-06, |
|
"objective/entropy": 48.286014556884766, |
|
"objective/kl": 0.8119473457336426, |
|
"objective/non_score_reward": -0.08119472861289978, |
|
"objective/rlhf_reward": -1.266162633895874, |
|
"objective/scores": -1.1875, |
|
"policy/approxkl_avg": 18.666072845458984, |
|
"policy/clipfrac_avg": 0.7314453125, |
|
"policy/entropy_avg": 0.912261962890625, |
|
"step": 10, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 16, |
|
"val/ratio": 1.020957112312317, |
|
"val/ratio_var": 0.00411860179156065 |
|
}, |
|
{ |
|
"episode": 768, |
|
"epoch": 0.010533967931748666, |
|
"eps": 6, |
|
"loss/policy_avg": -0.0872286781668663, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.9846547314578008e-06, |
|
"objective/entropy": 49.34376525878906, |
|
"objective/kl": 1.9591996669769287, |
|
"objective/non_score_reward": -0.1959199756383896, |
|
"objective/rlhf_reward": -1.2858657836914062, |
|
"objective/scores": -1.09375, |
|
"policy/approxkl_avg": 20.772502899169922, |
|
"policy/clipfrac_avg": 0.73828125, |
|
"policy/entropy_avg": 0.927978515625, |
|
"step": 15, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 12, |
|
"val/ratio": 1.0191609859466553, |
|
"val/ratio_var": 0.00307083735242486 |
|
}, |
|
{ |
|
"episode": 1024, |
|
"epoch": 0.014045290575664887, |
|
"eps": 6, |
|
"loss/policy_avg": -0.07566041499376297, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.9769820971867007e-06, |
|
"objective/entropy": 53.13662338256836, |
|
"objective/kl": 2.4811532497406006, |
|
"objective/non_score_reward": -0.24811533093452454, |
|
"objective/rlhf_reward": -1.2548893690109253, |
|
"objective/scores": -1.0078125, |
|
"policy/approxkl_avg": 20.665164947509766, |
|
"policy/clipfrac_avg": 0.7314453125, |
|
"policy/entropy_avg": 0.989776611328125, |
|
"step": 20, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 11, |
|
"val/ratio": 1.011010766029358, |
|
"val/ratio_var": 0.004201602190732956 |
|
}, |
|
{ |
|
"episode": 1280, |
|
"epoch": 0.01755661321958111, |
|
"eps": 6, |
|
"loss/policy_avg": -0.08593496680259705, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.9693094629156014e-06, |
|
"objective/entropy": 53.72633743286133, |
|
"objective/kl": 3.3111624717712402, |
|
"objective/non_score_reward": -0.3311161994934082, |
|
"objective/rlhf_reward": -1.339456558227539, |
|
"objective/scores": -1.0078125, |
|
"policy/approxkl_avg": 25.559288024902344, |
|
"policy/clipfrac_avg": 0.7353515625, |
|
"policy/entropy_avg": 0.997894287109375, |
|
"step": 25, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 13, |
|
"val/ratio": 1.0134021043777466, |
|
"val/ratio_var": 0.0019979747012257576 |
|
}, |
|
{ |
|
"episode": 1536, |
|
"epoch": 0.021067935863497332, |
|
"eps": 6, |
|
"loss/policy_avg": -0.09734417498111725, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.9616368286445014e-06, |
|
"objective/entropy": 51.259735107421875, |
|
"objective/kl": 5.089182376861572, |
|
"objective/non_score_reward": -0.5089181661605835, |
|
"objective/rlhf_reward": -1.2202520370483398, |
|
"objective/scores": -0.7109375, |
|
"policy/approxkl_avg": 29.841636657714844, |
|
"policy/clipfrac_avg": 0.736328125, |
|
"policy/entropy_avg": 0.960479736328125, |
|
"step": 30, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 26, |
|
"val/ratio": 1.0178756713867188, |
|
"val/ratio_var": 0.009866585955023766 |
|
}, |
|
{ |
|
"episode": 1792, |
|
"epoch": 0.024579258507413555, |
|
"eps": 6, |
|
"loss/policy_avg": -0.06831618398427963, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.9539641943734013e-06, |
|
"objective/entropy": 40.643272399902344, |
|
"objective/kl": 6.974010944366455, |
|
"objective/non_score_reward": -0.6974011063575745, |
|
"objective/rlhf_reward": -1.2684605121612549, |
|
"objective/scores": -0.5703125, |
|
"policy/approxkl_avg": 35.33942413330078, |
|
"policy/clipfrac_avg": 0.6982421875, |
|
"policy/entropy_avg": 0.7505035400390625, |
|
"step": 35, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 16, |
|
"val/ratio": 1.00449800491333, |
|
"val/ratio_var": 0.0022142010275274515 |
|
}, |
|
{ |
|
"episode": 2048, |
|
"epoch": 0.028090581151329775, |
|
"eps": 6, |
|
"loss/policy_avg": -0.04068079590797424, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.946291560102302e-06, |
|
"objective/entropy": 23.142562866210938, |
|
"objective/kl": 8.180486679077148, |
|
"objective/non_score_reward": -0.8180487155914307, |
|
"objective/rlhf_reward": -1.0729957818984985, |
|
"objective/scores": -0.255859375, |
|
"policy/approxkl_avg": 23.68307876586914, |
|
"policy/clipfrac_avg": 0.5859375, |
|
"policy/entropy_avg": 0.4361400604248047, |
|
"step": 40, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 8, |
|
"val/ratio": 1.0077030658721924, |
|
"val/ratio_var": 0.0024766812566667795 |
|
}, |
|
{ |
|
"episode": 2304, |
|
"epoch": 0.031601903795246, |
|
"eps": 6, |
|
"loss/policy_avg": -0.07307010889053345, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.938618925831202e-06, |
|
"objective/entropy": 19.376842498779297, |
|
"objective/kl": 8.770210266113281, |
|
"objective/non_score_reward": -0.8770210146903992, |
|
"objective/rlhf_reward": -1.0002652406692505, |
|
"objective/scores": -0.12353515625, |
|
"policy/approxkl_avg": 31.00873565673828, |
|
"policy/clipfrac_avg": 0.5302734375, |
|
"policy/entropy_avg": 0.33237457275390625, |
|
"step": 45, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 20, |
|
"val/ratio": 0.996111273765564, |
|
"val/ratio_var": 0.001100091845728457 |
|
}, |
|
{ |
|
"episode": 2560, |
|
"epoch": 0.03511322643916222, |
|
"eps": 6, |
|
"loss/policy_avg": -0.04584116116166115, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.9309462915601027e-06, |
|
"objective/entropy": 11.984097480773926, |
|
"objective/kl": 8.4966402053833, |
|
"objective/non_score_reward": -0.849664032459259, |
|
"objective/rlhf_reward": -0.8017911911010742, |
|
"objective/scores": 0.0478515625, |
|
"policy/approxkl_avg": 22.561037063598633, |
|
"policy/clipfrac_avg": 0.451171875, |
|
"policy/entropy_avg": 0.19393539428710938, |
|
"step": 50, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 20, |
|
"val/ratio": 0.9952375888824463, |
|
"val/ratio_var": 0.000761833623982966 |
|
}, |
|
{ |
|
"episode": 2816, |
|
"epoch": 0.03862454908307844, |
|
"eps": 5, |
|
"loss/policy_avg": -0.029720915481448174, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.9232736572890026e-06, |
|
"objective/entropy": 4.9489898681640625, |
|
"objective/kl": 8.733837127685547, |
|
"objective/non_score_reward": -0.8733837604522705, |
|
"objective/rlhf_reward": -0.7492713928222656, |
|
"objective/scores": 0.1240234375, |
|
"policy/approxkl_avg": 16.253189086914062, |
|
"policy/clipfrac_avg": 0.341796875, |
|
"policy/entropy_avg": 0.07728099822998047, |
|
"step": 55, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 18, |
|
"val/ratio": 0.9972053170204163, |
|
"val/ratio_var": 0.00032430028659291565 |
|
}, |
|
{ |
|
"episode": 3072, |
|
"epoch": 0.042135871726994664, |
|
"eps": 5, |
|
"loss/policy_avg": -0.01298562902957201, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.9156010230179026e-06, |
|
"objective/entropy": 1.3101667165756226, |
|
"objective/kl": 8.699792861938477, |
|
"objective/non_score_reward": -0.8699792623519897, |
|
"objective/rlhf_reward": -0.5752952098846436, |
|
"objective/scores": 0.294921875, |
|
"policy/approxkl_avg": 2.27925968170166, |
|
"policy/clipfrac_avg": 0.236328125, |
|
"policy/entropy_avg": 0.02513742446899414, |
|
"step": 60, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 20, |
|
"val/ratio": 1.0017118453979492, |
|
"val/ratio_var": 0.00016639505338389426 |
|
}, |
|
{ |
|
"episode": 3328, |
|
"epoch": 0.04564719437091089, |
|
"eps": 5, |
|
"loss/policy_avg": -0.02618303708732128, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.9079283887468033e-06, |
|
"objective/entropy": 2.3685269355773926, |
|
"objective/kl": 9.208517074584961, |
|
"objective/non_score_reward": -0.9208516478538513, |
|
"objective/rlhf_reward": -0.5182289481163025, |
|
"objective/scores": 0.40234375, |
|
"policy/approxkl_avg": 2.6189699172973633, |
|
"policy/clipfrac_avg": 0.310546875, |
|
"policy/entropy_avg": 0.04020071029663086, |
|
"step": 65, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 20, |
|
"val/ratio": 1.003983497619629, |
|
"val/ratio_var": 0.0009448421187698841 |
|
}, |
|
{ |
|
"episode": 3584, |
|
"epoch": 0.04915851701482711, |
|
"eps": 5, |
|
"loss/policy_avg": -0.02327096462249756, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.9002557544757032e-06, |
|
"objective/entropy": 2.0416018962860107, |
|
"objective/kl": 9.701976776123047, |
|
"objective/non_score_reward": -0.9701976776123047, |
|
"objective/rlhf_reward": -0.49486449360847473, |
|
"objective/scores": 0.474609375, |
|
"policy/approxkl_avg": 1.271956443786621, |
|
"policy/clipfrac_avg": 0.2734375, |
|
"policy/entropy_avg": 0.041253089904785156, |
|
"step": 70, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 16, |
|
"val/ratio": 1.0039558410644531, |
|
"val/ratio_var": 0.00041477559716440737 |
|
}, |
|
{ |
|
"episode": 3840, |
|
"epoch": 0.052669839658743334, |
|
"eps": 5, |
|
"loss/policy_avg": -0.033096276223659515, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.892583120204604e-06, |
|
"objective/entropy": 2.7795495986938477, |
|
"objective/kl": 10.028523445129395, |
|
"objective/non_score_reward": -1.0028523206710815, |
|
"objective/rlhf_reward": -0.46555712819099426, |
|
"objective/scores": 0.5390625, |
|
"policy/approxkl_avg": 3.055203676223755, |
|
"policy/clipfrac_avg": 0.3427734375, |
|
"policy/entropy_avg": 0.053270816802978516, |
|
"step": 75, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 23, |
|
"val/ratio": 1.0012407302856445, |
|
"val/ratio_var": 0.00011274257121840492 |
|
}, |
|
{ |
|
"episode": 4096, |
|
"epoch": 0.05618116230265955, |
|
"eps": 5, |
|
"loss/policy_avg": -0.01961323618888855, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.884910485933504e-06, |
|
"objective/entropy": 2.5525641441345215, |
|
"objective/kl": 10.111019134521484, |
|
"objective/non_score_reward": -1.0111019611358643, |
|
"objective/rlhf_reward": -0.510233461856842, |
|
"objective/scores": 0.5, |
|
"policy/approxkl_avg": 1.331697940826416, |
|
"policy/clipfrac_avg": 0.2861328125, |
|
"policy/entropy_avg": 0.048857688903808594, |
|
"step": 80, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 25, |
|
"val/ratio": 1.011049509048462, |
|
"val/ratio_var": 0.004252108279615641 |
|
}, |
|
{ |
|
"episode": 4352, |
|
"epoch": 0.05969248494657577, |
|
"eps": 5, |
|
"loss/policy_avg": -0.009127877652645111, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.877237851662404e-06, |
|
"objective/entropy": 3.016789674758911, |
|
"objective/kl": 11.257818222045898, |
|
"objective/non_score_reward": -1.125781774520874, |
|
"objective/rlhf_reward": -0.4276960492134094, |
|
"objective/scores": 0.69921875, |
|
"policy/approxkl_avg": 1.4772686958312988, |
|
"policy/clipfrac_avg": 0.35546875, |
|
"policy/entropy_avg": 0.053719520568847656, |
|
"step": 85, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 6, |
|
"val/ratio": 1.0042904615402222, |
|
"val/ratio_var": 0.0008556774700991809 |
|
}, |
|
{ |
|
"episode": 4608, |
|
"epoch": 0.063203807590492, |
|
"eps": 5, |
|
"loss/policy_avg": -0.025049656629562378, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.8695652173913046e-06, |
|
"objective/entropy": 2.5907459259033203, |
|
"objective/kl": 10.457273483276367, |
|
"objective/non_score_reward": -1.0457274913787842, |
|
"objective/rlhf_reward": -0.3816419839859009, |
|
"objective/scores": 0.6640625, |
|
"policy/approxkl_avg": 2.3460922241210938, |
|
"policy/clipfrac_avg": 0.322265625, |
|
"policy/entropy_avg": 0.04626178741455078, |
|
"step": 90, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 11, |
|
"val/ratio": 1.0003862380981445, |
|
"val/ratio_var": 7.93520302977413e-05 |
|
}, |
|
{ |
|
"episode": 4864, |
|
"epoch": 0.06671513023440821, |
|
"eps": 5, |
|
"loss/policy_avg": -0.01828361675143242, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.8618925831202045e-06, |
|
"objective/entropy": 2.397810220718384, |
|
"objective/kl": 10.732559204101562, |
|
"objective/non_score_reward": -1.073256015777588, |
|
"objective/rlhf_reward": -0.35966813564300537, |
|
"objective/scores": 0.71484375, |
|
"policy/approxkl_avg": 1.1093428134918213, |
|
"policy/clipfrac_avg": 0.32421875, |
|
"policy/entropy_avg": 0.041881561279296875, |
|
"step": 95, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 15, |
|
"val/ratio": 1.0054664611816406, |
|
"val/ratio_var": 0.0017973663052543998 |
|
}, |
|
{ |
|
"episode": 5120, |
|
"epoch": 0.07022645287832444, |
|
"eps": 5, |
|
"loss/policy_avg": -0.04088423401117325, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.8542199488491053e-06, |
|
"objective/entropy": 2.343449592590332, |
|
"objective/kl": 11.780994415283203, |
|
"objective/non_score_reward": -1.1780993938446045, |
|
"objective/rlhf_reward": -0.4628324806690216, |
|
"objective/scores": 0.71484375, |
|
"policy/approxkl_avg": 0.894420325756073, |
|
"policy/clipfrac_avg": 0.46875, |
|
"policy/entropy_avg": 0.04486083984375, |
|
"step": 100, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 11, |
|
"val/ratio": 1.0009559392929077, |
|
"val/ratio_var": 4.804596756002866e-05 |
|
}, |
|
{ |
|
"episode": 5376, |
|
"epoch": 0.07373777552224066, |
|
"eps": 5, |
|
"loss/policy_avg": -0.020697183907032013, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.846547314578005e-06, |
|
"objective/entropy": 1.9023351669311523, |
|
"objective/kl": 10.29288101196289, |
|
"objective/non_score_reward": -1.0292882919311523, |
|
"objective/rlhf_reward": -0.29047834873199463, |
|
"objective/scores": 0.73828125, |
|
"policy/approxkl_avg": 0.9143690466880798, |
|
"policy/clipfrac_avg": 0.373046875, |
|
"policy/entropy_avg": 0.028568267822265625, |
|
"step": 105, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 10, |
|
"val/ratio": 1.000715732574463, |
|
"val/ratio_var": 4.201457340968773e-05 |
|
}, |
|
{ |
|
"episode": 5632, |
|
"epoch": 0.07724909816615688, |
|
"eps": 5, |
|
"loss/policy_avg": -0.012633640319108963, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.8388746803069055e-06, |
|
"objective/entropy": 1.3839142322540283, |
|
"objective/kl": 10.57151985168457, |
|
"objective/non_score_reward": -1.0571520328521729, |
|
"objective/rlhf_reward": -0.2935946583747864, |
|
"objective/scores": 0.765625, |
|
"policy/approxkl_avg": 0.6525547504425049, |
|
"policy/clipfrac_avg": 0.2646484375, |
|
"policy/entropy_avg": 0.0345916748046875, |
|
"step": 110, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 10, |
|
"val/ratio": 0.9999199509620667, |
|
"val/ratio_var": 2.6978697860613465e-05 |
|
}, |
|
{ |
|
"episode": 5888, |
|
"epoch": 0.0807604208100731, |
|
"eps": 5, |
|
"loss/policy_avg": -0.026668714359402657, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.831202046035806e-06, |
|
"objective/entropy": 2.17741322517395, |
|
"objective/kl": 11.39688491821289, |
|
"objective/non_score_reward": -1.139688491821289, |
|
"objective/rlhf_reward": -0.3027456998825073, |
|
"objective/scores": 0.8359375, |
|
"policy/approxkl_avg": 8.829752922058105, |
|
"policy/clipfrac_avg": 0.35546875, |
|
"policy/entropy_avg": 0.034277915954589844, |
|
"step": 115, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 8, |
|
"val/ratio": 1.0012441873550415, |
|
"val/ratio_var": 9.009366476675496e-05 |
|
}, |
|
{ |
|
"episode": 6144, |
|
"epoch": 0.08427174345398933, |
|
"eps": 5, |
|
"loss/policy_avg": -0.011602860875427723, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.823529411764706e-06, |
|
"objective/entropy": 1.418602466583252, |
|
"objective/kl": 10.246469497680664, |
|
"objective/non_score_reward": -1.0246469974517822, |
|
"objective/rlhf_reward": -0.22599510848522186, |
|
"objective/scores": 0.796875, |
|
"policy/approxkl_avg": 0.31790149211883545, |
|
"policy/clipfrac_avg": 0.2314453125, |
|
"policy/entropy_avg": 0.028847694396972656, |
|
"step": 120, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 9, |
|
"val/ratio": 1.0009679794311523, |
|
"val/ratio_var": 3.900106457876973e-05 |
|
}, |
|
{ |
|
"episode": 6400, |
|
"epoch": 0.08778306609790555, |
|
"eps": 5, |
|
"loss/policy_avg": -0.0157505851238966, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.8158567774936066e-06, |
|
"objective/entropy": 1.936393141746521, |
|
"objective/kl": 10.550077438354492, |
|
"objective/non_score_reward": -1.0550076961517334, |
|
"objective/rlhf_reward": -0.252943217754364, |
|
"objective/scores": 0.80078125, |
|
"policy/approxkl_avg": 6.545133113861084, |
|
"policy/clipfrac_avg": 0.341796875, |
|
"policy/entropy_avg": 0.039971351623535156, |
|
"step": 125, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 12, |
|
"val/ratio": 1.0001187324523926, |
|
"val/ratio_var": 0.00011527155584190041 |
|
}, |
|
{ |
|
"episode": 6656, |
|
"epoch": 0.09129438874182177, |
|
"eps": 5, |
|
"loss/policy_avg": -0.00908716581761837, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.8081841432225065e-06, |
|
"objective/entropy": 1.9167767763137817, |
|
"objective/kl": 10.831771850585938, |
|
"objective/non_score_reward": -1.0831772089004517, |
|
"objective/rlhf_reward": -0.24270595610141754, |
|
"objective/scores": 0.83984375, |
|
"policy/approxkl_avg": 13.507976531982422, |
|
"policy/clipfrac_avg": 0.25, |
|
"policy/entropy_avg": 0.034499168395996094, |
|
"step": 130, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 7, |
|
"val/ratio": 1.0004911422729492, |
|
"val/ratio_var": 0.00018595268193166703 |
|
}, |
|
{ |
|
"episode": 6912, |
|
"epoch": 0.094805711385738, |
|
"eps": 5, |
|
"loss/policy_avg": -0.017197387292981148, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.800511508951407e-06, |
|
"objective/entropy": 1.7237651348114014, |
|
"objective/kl": 11.095592498779297, |
|
"objective/non_score_reward": -1.1095592975616455, |
|
"objective/rlhf_reward": -0.21057555079460144, |
|
"objective/scores": 0.8984375, |
|
"policy/approxkl_avg": 2.7560040950775146, |
|
"policy/clipfrac_avg": 0.2841796875, |
|
"policy/entropy_avg": 0.032952308654785156, |
|
"step": 135, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 2, |
|
"val/ratio": 0.9994020462036133, |
|
"val/ratio_var": 3.074964843108319e-05 |
|
}, |
|
{ |
|
"episode": 7168, |
|
"epoch": 0.09831703402965422, |
|
"eps": 5, |
|
"loss/policy_avg": -0.012010859325528145, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.792838874680307e-06, |
|
"objective/entropy": 1.5862581729888916, |
|
"objective/kl": 10.674396514892578, |
|
"objective/non_score_reward": -1.0674396753311157, |
|
"objective/rlhf_reward": -0.14433012902736664, |
|
"objective/scores": 0.921875, |
|
"policy/approxkl_avg": 1.1186727285385132, |
|
"policy/clipfrac_avg": 0.2783203125, |
|
"policy/entropy_avg": 0.0295562744140625, |
|
"step": 140, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 13, |
|
"val/ratio": 1.0007727146148682, |
|
"val/ratio_var": 4.557183274300769e-05 |
|
}, |
|
{ |
|
"episode": 7424, |
|
"epoch": 0.10182835667357044, |
|
"eps": 5, |
|
"loss/policy_avg": -0.013728385791182518, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.785166240409207e-06, |
|
"objective/entropy": 1.5388869047164917, |
|
"objective/kl": 10.359582901000977, |
|
"objective/non_score_reward": -1.035958170890808, |
|
"objective/rlhf_reward": -0.14511710405349731, |
|
"objective/scores": 0.890625, |
|
"policy/approxkl_avg": 0.5204602479934692, |
|
"policy/clipfrac_avg": 0.283203125, |
|
"policy/entropy_avg": 0.028924942016601562, |
|
"step": 145, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 14, |
|
"val/ratio": 1.056097149848938, |
|
"val/ratio_var": 0.13372056186199188 |
|
}, |
|
{ |
|
"episode": 7680, |
|
"epoch": 0.10533967931748667, |
|
"eps": 5, |
|
"loss/policy_avg": -0.014945434406399727, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.7774936061381074e-06, |
|
"objective/entropy": 2.0769755840301514, |
|
"objective/kl": 11.147063255310059, |
|
"objective/non_score_reward": -1.11470627784729, |
|
"objective/rlhf_reward": -0.08940108120441437, |
|
"objective/scores": 1.0234375, |
|
"policy/approxkl_avg": 0.5961493253707886, |
|
"policy/clipfrac_avg": 0.3681640625, |
|
"policy/entropy_avg": 0.037804603576660156, |
|
"step": 150, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 13, |
|
"val/ratio": 1.0033739805221558, |
|
"val/ratio_var": 0.00030022990540601313 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 391, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1.3716104077797742, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0, |
|
"train_batch_size": null, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|