{ "best_metric": null, "best_model_checkpoint": null, "episode": 32000, "epoch": 0.5751878347772945, "eval_steps": 500, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "episode": 16, "epoch": 0.00028759391738864725, "loss/policy_avg": 0.04147649183869362, "lr": 1e-05, "objective/entropy": 119.65733337402344, "objective/kl": 15.623376846313477, "objective/non_score_reward": -1.5623377561569214, "objective/rlhf_reward": -3.849351099133491, "objective/scores": 0.6, "policy/approxkl_avg": 473.7090759277344, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7531497478485107, "step": 0, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9990334510803223 }, { "episode": 32, "epoch": 0.0005751878347772945, "loss/policy_avg": 0.09634321182966232, "lr": 9.999360940695298e-06, "objective/entropy": -24.297130584716797, "objective/kl": 11.720248222351074, "objective/non_score_reward": -1.1720247268676758, "objective/rlhf_reward": -3.2880991645157334, "objective/scores": 0.35, "policy/approxkl_avg": 233.3876953125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6364185214042664, "step": 1, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9988316297531128 }, { "episode": 48, "epoch": 0.0008627817521659417, "loss/policy_avg": 0.5879926681518555, "lr": 9.998721881390595e-06, "objective/entropy": -123.47531127929688, "objective/kl": 7.935818672180176, "objective/non_score_reward": -0.7935818433761597, "objective/rlhf_reward": -0.25060838157055054, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 142.57273864746094, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6163707971572876, "step": 2, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.999882459640503 }, { "episode": 64, "epoch": 0.001150375669554589, "loss/policy_avg": 0.380592405796051, "lr": 9.99808282208589e-06, "objective/entropy": -117.48745727539062, "objective/kl": 10.153940200805664, "objective/non_score_reward": -1.0153939723968506, "objective/rlhf_reward": -2.682973676411015, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 190.00497436523438, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5329767465591431, "step": 3, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9998424053192139 }, { "episode": 80, "epoch": 0.001437969586943236, "loss/policy_avg": 0.14582836627960205, "lr": 9.997443762781187e-06, "objective/entropy": -217.63848876953125, "objective/kl": 10.502876281738281, "objective/non_score_reward": -1.0502876043319702, "objective/rlhf_reward": -2.777318381418554, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 221.2613067626953, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6339143514633179, "step": 4, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9996079206466675 }, { "episode": 96, "epoch": 0.0017255635043318834, "loss/policy_avg": 0.12740007042884827, "lr": 9.996804703476484e-06, "objective/entropy": 398.1901550292969, "objective/kl": 14.20137882232666, "objective/non_score_reward": -1.420137882232666, "objective/rlhf_reward": -3.28055148422718, "objective/scores": 0.6, "policy/approxkl_avg": 349.208740234375, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 1.005652904510498, "step": 5, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9985274076461792 }, { "episode": 112, "epoch": 0.0020131574217205307, "loss/policy_avg": 0.1509546935558319, "lr": 9.99616564417178e-06, "objective/entropy": -124.58861541748047, "objective/kl": 8.397514343261719, "objective/non_score_reward": -0.8397514224052429, "objective/rlhf_reward": -1.9590056151151658, "objective/scores": 0.35, "policy/approxkl_avg": 87.99980163574219, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7696092128753662, "step": 6, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9961457252502441 }, { "episode": 128, "epoch": 0.002300751339109178, "loss/policy_avg": 0.07236729562282562, "lr": 9.995526584867077e-06, "objective/entropy": -62.749176025390625, "objective/kl": 10.19581413269043, "objective/non_score_reward": -1.0195814371109009, "objective/rlhf_reward": -2.3449923555056253, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 151.23446655273438, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7507286071777344, "step": 7, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9975416660308838 }, { "episode": 144, "epoch": 0.002588345256497825, "loss/policy_avg": 0.1384029984474182, "lr": 9.994887525562374e-06, "objective/entropy": -143.49945068359375, "objective/kl": 12.088400840759277, "objective/non_score_reward": -1.2088401317596436, "objective/rlhf_reward": -3.435360452532768, "objective/scores": 0.35, "policy/approxkl_avg": 150.72146606445312, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7477531433105469, "step": 8, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0000879764556885 }, { "episode": 160, "epoch": 0.002875939173886472, "loss/policy_avg": -0.009389623999595642, "lr": 9.99424846625767e-06, "objective/entropy": -8.538755416870117, "objective/kl": 4.930829048156738, "objective/non_score_reward": -0.4930829405784607, "objective/rlhf_reward": -0.6306961386496122, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 21.575889587402344, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.4435485005378723, "step": 9, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.00315523147583 }, { "episode": 176, "epoch": 0.0031635330912751195, "loss/policy_avg": 0.09865772724151611, "lr": 9.993609406952966e-06, "objective/entropy": -17.656417846679688, "objective/kl": 7.901223659515381, "objective/non_score_reward": -0.790122389793396, "objective/rlhf_reward": -1.213078211026128, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 61.98566436767578, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7360714673995972, "step": 10, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9977571964263916 }, { "episode": 192, "epoch": 0.0034511270086637668, "loss/policy_avg": -0.005021991208195686, "lr": 9.992970347648263e-06, "objective/entropy": -36.69260787963867, "objective/kl": 10.859649658203125, "objective/non_score_reward": -1.0859650373458862, "objective/rlhf_reward": -1.9438601382076737, "objective/scores": 0.6, "policy/approxkl_avg": 145.91165161132812, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4879041314125061, "step": 11, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9994498491287231 }, { "episode": 208, "epoch": 0.003738720926052414, "loss/policy_avg": 0.35356682538986206, "lr": 9.992331288343558e-06, "objective/entropy": -69.72517395019531, "objective/kl": 10.624967575073242, "objective/non_score_reward": -1.0624967813491821, "objective/rlhf_reward": -2.6937278797298223, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 142.52261352539062, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6256821751594543, "step": 12, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9967740774154663 }, { "episode": 224, "epoch": 0.004026314843441061, "loss/policy_avg": 0.24467170238494873, "lr": 9.991692229038855e-06, "objective/entropy": -115.99034881591797, "objective/kl": 11.337324142456055, "objective/non_score_reward": -1.1337324380874634, "objective/rlhf_reward": -2.8730703942185505, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 95.43186950683594, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5122163891792297, "step": 13, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000202178955078 }, { "episode": 240, "epoch": 0.004313908760829708, "loss/policy_avg": 0.36638143658638, "lr": 9.991053169734152e-06, "objective/entropy": 90.19092559814453, "objective/kl": 8.482120513916016, "objective/non_score_reward": -0.8482120633125305, "objective/rlhf_reward": -1.5680194973674526, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 103.84627532958984, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.45586448907852173, "step": 14, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0012996196746826 }, { "episode": 256, "epoch": 0.004601502678218356, "loss/policy_avg": 0.3564397394657135, "lr": 9.990414110429449e-06, "objective/entropy": 62.88275146484375, "objective/kl": 8.093853950500488, "objective/non_score_reward": -0.8093854188919067, "objective/rlhf_reward": -1.7565889535502193, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 129.63275146484375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5616360902786255, "step": 15, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.000332832336426 }, { "episode": 272, "epoch": 0.004889096595607003, "loss/policy_avg": 0.731740415096283, "lr": 9.989775051124744e-06, "objective/entropy": 175.25027465820312, "objective/kl": 13.653030395507812, "objective/non_score_reward": -1.3653030395507812, "objective/rlhf_reward": -4.037380088766185, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 197.69329833984375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6656568050384521, "step": 16, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0004138946533203 }, { "episode": 288, "epoch": 0.00517669051299565, "loss/policy_avg": 0.0038209843914955854, "lr": 9.989135991820041e-06, "objective/entropy": 166.37741088867188, "objective/kl": 11.93104362487793, "objective/non_score_reward": -1.1931045055389404, "objective/rlhf_reward": -3.3724178135395046, "objective/scores": 0.35, "policy/approxkl_avg": 123.38684844970703, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4346945881843567, "step": 17, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.002516269683838 }, { "episode": 304, "epoch": 0.0054642844303842975, "loss/policy_avg": 0.5328235626220703, "lr": 9.988496932515338e-06, "objective/entropy": -59.579795837402344, "objective/kl": 14.574970245361328, "objective/non_score_reward": -1.457497000694275, "objective/rlhf_reward": -4.273728727307871, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 107.66255187988281, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6419472098350525, "step": 18, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0002565383911133 }, { "episode": 320, "epoch": 0.005751878347772944, "loss/policy_avg": 0.1068505123257637, "lr": 9.987857873210635e-06, "objective/entropy": 25.82529067993164, "objective/kl": 7.757124900817871, "objective/non_score_reward": -0.7757124900817871, "objective/rlhf_reward": -1.702849841117859, "objective/scores": 0.35, "policy/approxkl_avg": 35.83104705810547, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.37679600715637207, "step": 19, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0017640590667725 }, { "episode": 336, "epoch": 0.006039472265161592, "loss/policy_avg": 0.9153174757957458, "lr": 9.987218813905932e-06, "objective/entropy": 123.23423767089844, "objective/kl": 15.62867546081543, "objective/non_score_reward": -1.5628674030303955, "objective/rlhf_reward": -4.770517232830882, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 175.58567810058594, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6862951517105103, "step": 20, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9951945543289185 }, { "episode": 352, "epoch": 0.006327066182550239, "loss/policy_avg": 0.13535380363464355, "lr": 9.986579754601228e-06, "objective/entropy": 106.94303894042969, "objective/kl": 14.264102935791016, "objective/non_score_reward": -1.42641019821167, "objective/rlhf_reward": -2.781921450735304, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 203.86151123046875, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4334957003593445, "step": 21, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9984745979309082 }, { "episode": 368, "epoch": 0.006614660099938887, "loss/policy_avg": 0.08913514018058777, "lr": 9.985940695296524e-06, "objective/entropy": 86.8988037109375, "objective/kl": 14.969903945922852, "objective/non_score_reward": -1.4969902038574219, "objective/rlhf_reward": -4.564128805597392, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 204.34201049804688, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6533622741699219, "step": 22, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9993085861206055 }, { "episode": 384, "epoch": 0.0069022540173275335, "loss/policy_avg": 0.4681934416294098, "lr": 9.98530163599182e-06, "objective/entropy": -86.89934539794922, "objective/kl": 17.868688583374023, "objective/non_score_reward": -1.7868685722351074, "objective/rlhf_reward": -5.821961793929262, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 180.03530883789062, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6652738451957703, "step": 23, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9969769716262817 }, { "episode": 400, "epoch": 0.00718984793471618, "loss/policy_avg": 0.05787897855043411, "lr": 9.984662576687117e-06, "objective/entropy": 217.01751708984375, "objective/kl": 7.942338466644287, "objective/non_score_reward": -0.7942339181900024, "objective/rlhf_reward": -1.7769354641437531, "objective/scores": 0.35, "policy/approxkl_avg": 14.617660522460938, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7284016609191895, "step": 24, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0007760524749756 }, { "episode": 416, "epoch": 0.007477441852104828, "loss/policy_avg": 0.17751406133174896, "lr": 9.984023517382414e-06, "objective/entropy": 79.38223266601562, "objective/kl": 13.876078605651855, "objective/non_score_reward": -1.3876079320907593, "objective/rlhf_reward": -4.191181921695156, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 129.87246704101562, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.9167462587356567, "step": 25, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9985355138778687 }, { "episode": 432, "epoch": 0.007765035769493475, "loss/policy_avg": 0.529009222984314, "lr": 9.983384458077711e-06, "objective/entropy": -31.18558120727539, "objective/kl": 14.786969184875488, "objective/non_score_reward": -1.4786969423294067, "objective/rlhf_reward": -4.5361854816354334, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 125.92539978027344, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.43768489360809326, "step": 26, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9984157085418701 }, { "episode": 448, "epoch": 0.008052629686882123, "loss/policy_avg": 0.3665599822998047, "lr": 9.982745398773006e-06, "objective/entropy": 23.827144622802734, "objective/kl": 13.60982894897461, "objective/non_score_reward": -1.360982894897461, "objective/rlhf_reward": -5.443931698799133, "objective/scores": 0.0, "policy/approxkl_avg": 127.14844512939453, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5104779601097107, "step": 27, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9987092018127441 }, { "episode": 464, "epoch": 0.00834022360427077, "loss/policy_avg": 0.32786238193511963, "lr": 9.982106339468303e-06, "objective/entropy": 128.9566650390625, "objective/kl": 11.556554794311523, "objective/non_score_reward": -1.1556555032730103, "objective/rlhf_reward": -3.297109279662294, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 59.29738998413086, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4545682668685913, "step": 28, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0006556510925293 }, { "episode": 480, "epoch": 0.008627817521659416, "loss/policy_avg": 0.2694750428199768, "lr": 9.9814672801636e-06, "objective/entropy": 78.4908447265625, "objective/kl": 9.683059692382812, "objective/non_score_reward": -0.9683058857917786, "objective/rlhf_reward": -2.5139735278829765, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 102.18389892578125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.659131646156311, "step": 29, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9980244636535645 }, { "episode": 496, "epoch": 0.008915411439048063, "loss/policy_avg": -0.2861338257789612, "lr": 9.980828220858897e-06, "objective/entropy": -90.27975463867188, "objective/kl": 7.361126899719238, "objective/non_score_reward": -0.7361127138137817, "objective/rlhf_reward": -1.1196221962300053, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 76.95925903320312, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5377018451690674, "step": 30, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.006430149078369 }, { "episode": 512, "epoch": 0.009203005356436712, "loss/policy_avg": 0.1336214542388916, "lr": 9.980189161554194e-06, "objective/entropy": 153.1845703125, "objective/kl": 12.326415061950684, "objective/non_score_reward": -1.232641577720642, "objective/rlhf_reward": -3.5713163701042365, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 140.37075805664062, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7196662425994873, "step": 31, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9995383024215698 }, { "episode": 528, "epoch": 0.009490599273825359, "loss/policy_avg": -0.03590531647205353, "lr": 9.97955010224949e-06, "objective/entropy": -60.39399719238281, "objective/kl": 7.551569938659668, "objective/non_score_reward": -0.7551569938659668, "objective/rlhf_reward": -1.6613781240925025, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 36.98230743408203, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.617447018623352, "step": 32, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0055394172668457 }, { "episode": 544, "epoch": 0.009778193191214006, "loss/policy_avg": 0.23507678508758545, "lr": 9.978911042944786e-06, "objective/entropy": -62.405269622802734, "objective/kl": 12.254663467407227, "objective/non_score_reward": -1.2254663705825806, "objective/rlhf_reward": -3.386093669923481, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 28.730735778808594, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5197240114212036, "step": 33, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9973214864730835 }, { "episode": 560, "epoch": 0.010065787108602653, "loss/policy_avg": 0.16142824292182922, "lr": 9.978271983640083e-06, "objective/entropy": 63.909202575683594, "objective/kl": 11.40770149230957, "objective/non_score_reward": -1.1407701969146729, "objective/rlhf_reward": -1.6393620713960855, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 103.13188171386719, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6455787420272827, "step": 34, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9994690418243408 }, { "episode": 576, "epoch": 0.0103533810259913, "loss/policy_avg": 0.10174459218978882, "lr": 9.977632924335378e-06, "objective/entropy": -30.112831115722656, "objective/kl": 17.954376220703125, "objective/non_score_reward": -1.7954376935958862, "objective/rlhf_reward": -5.059044542089973, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 330.9220886230469, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.519372820854187, "step": 35, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9978516101837158 }, { "episode": 592, "epoch": 0.010640974943379948, "loss/policy_avg": 0.47705915570259094, "lr": 9.976993865030675e-06, "objective/entropy": 302.72314453125, "objective/kl": 19.512754440307617, "objective/non_score_reward": -1.9512755870819092, "objective/rlhf_reward": -3.4051021099090573, "objective/scores": 1.1, "policy/approxkl_avg": 104.70938110351562, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8252858519554138, "step": 36, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9983307123184204 }, { "episode": 608, "epoch": 0.010928568860768595, "loss/policy_avg": 0.3472205400466919, "lr": 9.976354805725972e-06, "objective/entropy": -59.4378662109375, "objective/kl": 10.388540267944336, "objective/non_score_reward": -1.0388540029525757, "objective/rlhf_reward": -2.3305873229828586, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 22.55358123779297, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6380844712257385, "step": 37, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9981659650802612 }, { "episode": 624, "epoch": 0.011216162778157242, "loss/policy_avg": 0.44485026597976685, "lr": 9.975715746421269e-06, "objective/entropy": 76.74449157714844, "objective/kl": 10.349222183227539, "objective/non_score_reward": -1.0349223613739014, "objective/rlhf_reward": -4.139689266681671, "objective/scores": 0.0, "policy/approxkl_avg": 78.09274291992188, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6423808336257935, "step": 38, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9971048831939697 }, { "episode": 640, "epoch": 0.011503756695545889, "loss/policy_avg": 0.37319111824035645, "lr": 9.975076687116566e-06, "objective/entropy": -67.30467224121094, "objective/kl": 19.358768463134766, "objective/non_score_reward": -1.9358769655227661, "objective/rlhf_reward": -6.227735841067966, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 161.26229858398438, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6262432336807251, "step": 39, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9987332820892334 }, { "episode": 656, "epoch": 0.011791350612934537, "loss/policy_avg": 0.389024943113327, "lr": 9.97443762781186e-06, "objective/entropy": 210.994384765625, "objective/kl": 11.99485969543457, "objective/non_score_reward": -1.1994858980178833, "objective/rlhf_reward": -4.797943651676178, "objective/scores": 0.0, "policy/approxkl_avg": 79.62628173828125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7361236810684204, "step": 40, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9987305402755737 }, { "episode": 672, "epoch": 0.012078944530323184, "loss/policy_avg": 0.4818825125694275, "lr": 9.973798568507158e-06, "objective/entropy": 280.91552734375, "objective/kl": 17.216154098510742, "objective/non_score_reward": -1.7216153144836426, "objective/rlhf_reward": -5.435863117785797, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 82.88700866699219, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7085442543029785, "step": 41, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.000746488571167 }, { "episode": 688, "epoch": 0.012366538447711831, "loss/policy_avg": 0.7192404270172119, "lr": 9.973159509202454e-06, "objective/entropy": 89.66543579101562, "objective/kl": 12.255132675170898, "objective/non_score_reward": -1.2255134582519531, "objective/rlhf_reward": -0.5020534753799435, "objective/scores": 1.1, "policy/approxkl_avg": 79.93511199951172, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5533976554870605, "step": 42, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9999737739562988 }, { "episode": 704, "epoch": 0.012654132365100478, "loss/policy_avg": 0.0401420071721077, "lr": 9.972520449897751e-06, "objective/entropy": 175.6131591796875, "objective/kl": 12.72716999053955, "objective/non_score_reward": -1.272716999053955, "objective/rlhf_reward": -2.167148996831152, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 138.8005828857422, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8866395354270935, "step": 43, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9959650039672852 }, { "episode": 720, "epoch": 0.012941726282489125, "loss/policy_avg": 0.5428536534309387, "lr": 9.971881390593048e-06, "objective/entropy": 122.98509216308594, "objective/kl": 14.87851619720459, "objective/non_score_reward": -1.487851619720459, "objective/rlhf_reward": -4.435634577068027, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 57.47890853881836, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7034124135971069, "step": 44, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.998563528060913 }, { "episode": 736, "epoch": 0.013229320199877773, "loss/policy_avg": 1.027585744857788, "lr": 9.971242331288345e-06, "objective/entropy": 119.49530792236328, "objective/kl": 18.71068572998047, "objective/non_score_reward": -1.8710683584213257, "objective/rlhf_reward": -5.659444983276437, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 190.9130859375, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.9129630923271179, "step": 45, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9993374347686768 }, { "episode": 752, "epoch": 0.01351691411726642, "loss/policy_avg": -0.013658525422215462, "lr": 9.97060327198364e-06, "objective/entropy": 10.491897583007812, "objective/kl": 13.526758193969727, "objective/non_score_reward": -1.3526759147644043, "objective/rlhf_reward": -1.0107036590576168, "objective/scores": 1.1, "policy/approxkl_avg": 102.07904815673828, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.482522189617157, "step": 46, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0015859603881836 }, { "episode": 768, "epoch": 0.013804508034655067, "loss/policy_avg": 0.18925166130065918, "lr": 9.969964212678937e-06, "objective/entropy": -118.86809539794922, "objective/kl": 10.978793144226074, "objective/non_score_reward": -1.097879409790039, "objective/rlhf_reward": -3.0498822390133435, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 104.2835693359375, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6507794857025146, "step": 47, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.996565341949463 }, { "episode": 784, "epoch": 0.014092101952043714, "loss/policy_avg": 0.5690521597862244, "lr": 9.969325153374234e-06, "objective/entropy": -175.16403198242188, "objective/kl": 19.28797149658203, "objective/non_score_reward": -1.9287970066070557, "objective/rlhf_reward": -6.158928765860155, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 288.65631103515625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6812887787818909, "step": 48, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9999005794525146 }, { "episode": 800, "epoch": 0.01437969586943236, "loss/policy_avg": 0.5041743516921997, "lr": 9.968686094069531e-06, "objective/entropy": 318.1710205078125, "objective/kl": 17.975252151489258, "objective/non_score_reward": -1.79752516746521, "objective/rlhf_reward": -5.84846519520822, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 390.5566101074219, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.8034517765045166, "step": 49, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9982198476791382 }, { "episode": 816, "epoch": 0.01466728978682101, "loss/policy_avg": 0.21048909425735474, "lr": 9.968047034764828e-06, "objective/entropy": 16.597354888916016, "objective/kl": 22.140174865722656, "objective/non_score_reward": -2.214017629623413, "objective/rlhf_reward": -6.733364107386146, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 327.35992431640625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4595518708229065, "step": 50, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9974133968353271 }, { "episode": 832, "epoch": 0.014954883704209656, "loss/policy_avg": 0.6745895147323608, "lr": 9.967407975460123e-06, "objective/entropy": 26.577850341796875, "objective/kl": 15.099103927612305, "objective/non_score_reward": -1.5099103450775146, "objective/rlhf_reward": -4.435521040026265, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 52.37441635131836, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5440022945404053, "step": 51, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.997567057609558 }, { "episode": 848, "epoch": 0.015242477621598303, "loss/policy_avg": 1.5183483362197876, "lr": 9.96676891615542e-06, "objective/entropy": -133.34732055664062, "objective/kl": 15.838411331176758, "objective/non_score_reward": -1.58384108543396, "objective/rlhf_reward": -4.9937288074785755, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 181.23886108398438, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.491477906703949, "step": 52, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.999549388885498 }, { "episode": 864, "epoch": 0.01553007153898695, "loss/policy_avg": 1.0412685871124268, "lr": 9.966129856850717e-06, "objective/entropy": -12.032562255859375, "objective/kl": 13.811055183410645, "objective/non_score_reward": -1.3811054229736328, "objective/rlhf_reward": -4.008649730476078, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 197.14422607421875, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5553240776062012, "step": 53, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.998724341392517 }, { "episode": 880, "epoch": 0.0158176654563756, "loss/policy_avg": 0.11221161484718323, "lr": 9.965490797546014e-06, "objective/entropy": 239.8121795654297, "objective/kl": 14.03902816772461, "objective/non_score_reward": -1.4039026498794556, "objective/rlhf_reward": -3.790781910690378, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 54.992515563964844, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.8224223852157593, "step": 54, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.998995304107666 }, { "episode": 896, "epoch": 0.016105259373764245, "loss/policy_avg": 0.27755433320999146, "lr": 9.96485173824131e-06, "objective/entropy": 311.885009765625, "objective/kl": 21.855777740478516, "objective/non_score_reward": -2.185577630996704, "objective/rlhf_reward": -8.742310643196106, "objective/scores": 0.0, "policy/approxkl_avg": 196.02963256835938, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7535547018051147, "step": 55, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9981474876403809 }, { "episode": 912, "epoch": 0.016392853291152892, "loss/policy_avg": 0.23765933513641357, "lr": 9.964212678936606e-06, "objective/entropy": -135.26939392089844, "objective/kl": 17.994558334350586, "objective/non_score_reward": -1.7994558811187744, "objective/rlhf_reward": -5.464490548769633, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 143.15103149414062, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7061681747436523, "step": 56, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.001481771469116 }, { "episode": 928, "epoch": 0.01668044720854154, "loss/policy_avg": -0.09936670958995819, "lr": 9.963573619631903e-06, "objective/entropy": 273.41107177734375, "objective/kl": 17.296648025512695, "objective/non_score_reward": -1.7296650409698486, "objective/rlhf_reward": -5.559410088990612, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 170.04476928710938, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.664265513420105, "step": 57, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.996645212173462 }, { "episode": 944, "epoch": 0.016968041125930186, "loss/policy_avg": -0.40471351146698, "lr": 9.9629345603272e-06, "objective/entropy": 91.30682373046875, "objective/kl": 9.7944974899292, "objective/non_score_reward": -0.9794497489929199, "objective/rlhf_reward": -2.361539780107096, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 46.471343994140625, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6433554887771606, "step": 58, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0122342109680176 }, { "episode": 960, "epoch": 0.017255635043318833, "loss/policy_avg": 0.758541464805603, "lr": 9.962295501022495e-06, "objective/entropy": -28.099227905273438, "objective/kl": 15.942657470703125, "objective/non_score_reward": -1.5942658185958862, "objective/rlhf_reward": -4.2543568483748775, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 199.81590270996094, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6792592406272888, "step": 59, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9988605976104736 }, { "episode": 976, "epoch": 0.01754322896070748, "loss/policy_avg": 0.11890214681625366, "lr": 9.961656441717792e-06, "objective/entropy": 48.058135986328125, "objective/kl": 16.837148666381836, "objective/non_score_reward": -1.6837148666381836, "objective/rlhf_reward": -5.130739483896809, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 123.93780517578125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8319634199142456, "step": 60, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9987045526504517 }, { "episode": 992, "epoch": 0.017830822878096127, "loss/policy_avg": -0.3681066036224365, "lr": 9.961017382413088e-06, "objective/entropy": -12.798896789550781, "objective/kl": 13.068469047546387, "objective/non_score_reward": -1.3068468570709229, "objective/rlhf_reward": -3.8857517450148156, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 29.726402282714844, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.425646036863327, "step": 61, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.004940986633301 }, { "episode": 1008, "epoch": 0.018118416795484777, "loss/policy_avg": 0.6650391221046448, "lr": 9.960378323108385e-06, "objective/entropy": 172.82774353027344, "objective/kl": 19.25320053100586, "objective/non_score_reward": -1.9253199100494385, "objective/rlhf_reward": -5.753868768887456, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 61.55193328857422, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.707613468170166, "step": 62, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9996411800384521 }, { "episode": 1024, "epoch": 0.018406010712873424, "loss/policy_avg": 0.006029143929481506, "lr": 9.959739263803682e-06, "objective/entropy": 84.45201110839844, "objective/kl": 9.024871826171875, "objective/non_score_reward": -0.9024871587753296, "objective/rlhf_reward": -2.1593505545571894, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 54.238502502441406, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7926748991012573, "step": 63, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0010342597961426 }, { "episode": 1040, "epoch": 0.01869360463026207, "loss/policy_avg": 0.43513649702072144, "lr": 9.959100204498979e-06, "objective/entropy": 271.2078857421875, "objective/kl": 19.053577423095703, "objective/non_score_reward": -1.905357837677002, "objective/rlhf_reward": -6.2959189749061295, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 146.22186279296875, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 1.0022022724151611, "step": 64, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9983612298965454 }, { "episode": 1056, "epoch": 0.018981198547650718, "loss/policy_avg": 0.207576721906662, "lr": 9.958461145194274e-06, "objective/entropy": 61.16169738769531, "objective/kl": 12.455079078674316, "objective/non_score_reward": -1.2455079555511475, "objective/rlhf_reward": -3.1572031929817905, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 90.93212890625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8675985336303711, "step": 65, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000087022781372 }, { "episode": 1072, "epoch": 0.019268792465039365, "loss/policy_avg": 0.036766890436410904, "lr": 9.957822085889571e-06, "objective/entropy": -109.95204162597656, "objective/kl": 18.774991989135742, "objective/non_score_reward": -1.8774993419647217, "objective/rlhf_reward": -5.776663676897684, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 143.13140869140625, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6979721188545227, "step": 66, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9986827373504639 }, { "episode": 1088, "epoch": 0.01955638638242801, "loss/policy_avg": 0.2812209725379944, "lr": 9.957183026584868e-06, "objective/entropy": 347.89093017578125, "objective/kl": 18.375164031982422, "objective/non_score_reward": -1.8375165462493896, "objective/rlhf_reward": -5.688206379831421, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 156.17637634277344, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8508192300796509, "step": 67, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.0011837482452393 }, { "episode": 1104, "epoch": 0.019843980299816658, "loss/policy_avg": 0.4886673092842102, "lr": 9.956543967280165e-06, "objective/entropy": 386.39532470703125, "objective/kl": 21.181537628173828, "objective/non_score_reward": -2.1181535720825195, "objective/rlhf_reward": -8.472614765167236, "objective/scores": 0.0, "policy/approxkl_avg": 175.6392822265625, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.9587714672088623, "step": 68, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9981954097747803 }, { "episode": 1120, "epoch": 0.020131574217205305, "loss/policy_avg": 0.20124448835849762, "lr": 9.955904907975462e-06, "objective/entropy": -67.81639099121094, "objective/kl": 18.70073127746582, "objective/non_score_reward": -1.8700731992721558, "objective/rlhf_reward": -6.05646069784936, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 92.5486831665039, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6175429224967957, "step": 69, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.00050950050354 }, { "episode": 1136, "epoch": 0.020419168134593952, "loss/policy_avg": -0.021352097392082214, "lr": 9.955265848670757e-06, "objective/entropy": -17.604766845703125, "objective/kl": 21.45330810546875, "objective/non_score_reward": -2.1453306674957275, "objective/rlhf_reward": -6.919463162839996, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 400.26580810546875, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.9343768358230591, "step": 70, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9998741149902344 }, { "episode": 1152, "epoch": 0.0207067620519826, "loss/policy_avg": 1.1225731372833252, "lr": 9.954626789366054e-06, "objective/entropy": 25.78099822998047, "objective/kl": 14.004438400268555, "objective/non_score_reward": -1.4004437923431396, "objective/rlhf_reward": -3.776946599754404, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 111.43013000488281, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5702972412109375, "step": 71, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9973070621490479 }, { "episode": 1168, "epoch": 0.02099435596937125, "loss/policy_avg": 0.25385117530822754, "lr": 9.95398773006135e-06, "objective/entropy": -119.72091674804688, "objective/kl": 15.869585037231445, "objective/non_score_reward": -1.586958408355713, "objective/rlhf_reward": -4.400422762112553, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 68.1309814453125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6043493151664734, "step": 72, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9978251457214355 }, { "episode": 1184, "epoch": 0.021281949886759896, "loss/policy_avg": 1.0585708618164062, "lr": 9.953348670756648e-06, "objective/entropy": 175.019775390625, "objective/kl": 22.812929153442383, "objective/non_score_reward": -2.2812929153442383, "objective/rlhf_reward": -7.7996585703193375, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 238.55490112304688, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.764386773109436, "step": 73, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.998070478439331 }, { "episode": 1200, "epoch": 0.021569543804148543, "loss/policy_avg": 0.3202959895133972, "lr": 9.952709611451944e-06, "objective/entropy": -55.03008270263672, "objective/kl": 20.011316299438477, "objective/non_score_reward": -2.001131772994995, "objective/rlhf_reward": -5.60452709197998, "objective/scores": 0.6, "policy/approxkl_avg": 218.61663818359375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7646293640136719, "step": 74, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.000119686126709 }, { "episode": 1216, "epoch": 0.02185713772153719, "loss/policy_avg": 0.07566210627555847, "lr": 9.952070552147241e-06, "objective/entropy": -71.89826965332031, "objective/kl": 18.70985984802246, "objective/non_score_reward": -1.8709862232208252, "objective/rlhf_reward": -5.536533663945134, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 150.85037231445312, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.622381865978241, "step": 75, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.000347137451172 }, { "episode": 1232, "epoch": 0.022144731638925837, "loss/policy_avg": 1.0919684171676636, "lr": 9.951431492842536e-06, "objective/entropy": 52.233760833740234, "objective/kl": 16.872692108154297, "objective/non_score_reward": -1.6872694492340088, "objective/rlhf_reward": -5.1449576354661755, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 106.15821838378906, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5982179641723633, "step": 76, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9982967376708984 }, { "episode": 1248, "epoch": 0.022432325556314484, "loss/policy_avg": 0.08637362718582153, "lr": 9.950792433537833e-06, "objective/entropy": -41.915985107421875, "objective/kl": 13.70026969909668, "objective/non_score_reward": -1.37002694606781, "objective/rlhf_reward": -3.81824833673297, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 65.06179809570312, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7901297807693481, "step": 77, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9979660511016846 }, { "episode": 1264, "epoch": 0.02271991947370313, "loss/policy_avg": 0.25904232263565063, "lr": 9.950153374233129e-06, "objective/entropy": 80.24528503417969, "objective/kl": 11.508593559265137, "objective/non_score_reward": -1.1508593559265137, "objective/rlhf_reward": -3.1224849848107095, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 80.6361312866211, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5768579244613647, "step": 78, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.997675895690918 }, { "episode": 1280, "epoch": 0.023007513391091777, "loss/policy_avg": 1.0851349830627441, "lr": 9.949514314928425e-06, "objective/entropy": 179.42474365234375, "objective/kl": 17.690536499023438, "objective/non_score_reward": -1.7690538167953491, "objective/rlhf_reward": -5.128804068045552, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 140.53465270996094, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5955301523208618, "step": 79, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9972807168960571 }, { "episode": 1296, "epoch": 0.023295107308480424, "loss/policy_avg": 0.10646107792854309, "lr": 9.948875255623722e-06, "objective/entropy": 192.84939575195312, "objective/kl": 20.0378360748291, "objective/non_score_reward": -2.003783702850342, "objective/rlhf_reward": -8.015134632587433, "objective/scores": 0.0, "policy/approxkl_avg": 228.60391235351562, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5153179168701172, "step": 80, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.999485969543457 }, { "episode": 1312, "epoch": 0.023582701225869074, "loss/policy_avg": 0.4094586968421936, "lr": 9.94823619631902e-06, "objective/entropy": -145.0159912109375, "objective/kl": 16.018333435058594, "objective/non_score_reward": -1.6018333435058594, "objective/rlhf_reward": -6.407333076000214, "objective/scores": 0.0, "policy/approxkl_avg": 113.58265686035156, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6427879333496094, "step": 81, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9979770183563232 }, { "episode": 1328, "epoch": 0.02387029514325772, "loss/policy_avg": 0.5852205753326416, "lr": 9.947597137014316e-06, "objective/entropy": -188.32510375976562, "objective/kl": 16.71861457824707, "objective/non_score_reward": -1.6718615293502808, "objective/rlhf_reward": -5.328196370337887, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 47.16347122192383, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4781952202320099, "step": 82, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9979345798492432 }, { "episode": 1344, "epoch": 0.024157889060646368, "loss/policy_avg": 0.7273481488227844, "lr": 9.946958077709611e-06, "objective/entropy": -261.5775146484375, "objective/kl": 17.21273422241211, "objective/non_score_reward": -1.721273422241211, "objective/rlhf_reward": -5.280973825518208, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 91.5807113647461, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7938202619552612, "step": 83, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.998490333557129 }, { "episode": 1360, "epoch": 0.024445482978035015, "loss/policy_avg": 0.6384750604629517, "lr": 9.946319018404908e-06, "objective/entropy": -149.5916290283203, "objective/kl": 21.390371322631836, "objective/non_score_reward": -2.1390371322631836, "objective/rlhf_reward": -7.230635855227632, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 152.96697998046875, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.733663022518158, "step": 84, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9965767860412598 }, { "episode": 1376, "epoch": 0.024733076895423662, "loss/policy_avg": 1.4870085716247559, "lr": 9.945679959100205e-06, "objective/entropy": 279.02581787109375, "objective/kl": 13.598295211791992, "objective/non_score_reward": -1.3598296642303467, "objective/rlhf_reward": -4.015486557682125, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 75.3885269165039, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.9894934296607971, "step": 85, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0009098052978516 }, { "episode": 1392, "epoch": 0.02502067081281231, "loss/policy_avg": 0.34267449378967285, "lr": 9.945040899795502e-06, "objective/entropy": -9.687551498413086, "objective/kl": 17.537944793701172, "objective/non_score_reward": -1.7537946701049805, "objective/rlhf_reward": -4.091459666134092, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 86.10018920898438, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7380132675170898, "step": 86, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9966371059417725 }, { "episode": 1408, "epoch": 0.025308264730200956, "loss/policy_avg": 0.662402868270874, "lr": 9.944401840490799e-06, "objective/entropy": 182.38612365722656, "objective/kl": 17.891094207763672, "objective/non_score_reward": -1.789109230041504, "objective/rlhf_reward": -5.705838660807952, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 27.472164154052734, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6948930025100708, "step": 87, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9972063302993774 }, { "episode": 1424, "epoch": 0.025595858647589603, "loss/policy_avg": 2.4642419815063477, "lr": 9.943762781186096e-06, "objective/entropy": 9.746139526367188, "objective/kl": 17.692127227783203, "objective/non_score_reward": -1.7692127227783203, "objective/rlhf_reward": -5.735215237646727, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 221.2765350341797, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6335443258285522, "step": 88, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.997534990310669 }, { "episode": 1440, "epoch": 0.02588345256497825, "loss/policy_avg": 0.4348924160003662, "lr": 9.94312372188139e-06, "objective/entropy": 134.22723388671875, "objective/kl": 27.34999656677246, "objective/non_score_reward": -2.734999895095825, "objective/rlhf_reward": -9.278140073240387, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 201.6007080078125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8156861662864685, "step": 89, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9973480701446533 }, { "episode": 1456, "epoch": 0.026171046482366896, "loss/policy_avg": 0.1291811764240265, "lr": 9.942484662576688e-06, "objective/entropy": 112.80955505371094, "objective/kl": 15.703033447265625, "objective/non_score_reward": -1.5703033208847046, "objective/rlhf_reward": -4.333801994996007, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 155.86367797851562, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5453826189041138, "step": 90, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9969561100006104 }, { "episode": 1472, "epoch": 0.026458640399755547, "loss/policy_avg": 0.5085259675979614, "lr": 9.941845603271985e-06, "objective/entropy": 178.85531616210938, "objective/kl": 17.638357162475586, "objective/non_score_reward": -1.7638356685638428, "objective/rlhf_reward": -5.574390235360026, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 101.44283294677734, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.42108362913131714, "step": 91, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9999537467956543 }, { "episode": 1488, "epoch": 0.026746234317144194, "loss/policy_avg": 0.6299684643745422, "lr": 9.941206543967281e-06, "objective/entropy": -44.030662536621094, "objective/kl": 20.842021942138672, "objective/non_score_reward": -2.0842020511627197, "objective/rlhf_reward": -6.977558457587643, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 154.17724609375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8085545301437378, "step": 92, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.997336506843567 }, { "episode": 1504, "epoch": 0.02703382823453284, "loss/policy_avg": 0.334034264087677, "lr": 9.940567484662578e-06, "objective/entropy": 137.08668518066406, "objective/kl": 12.822792053222656, "objective/non_score_reward": -1.2822792530059814, "objective/rlhf_reward": -5.129117101430893, "objective/scores": 0.0, "policy/approxkl_avg": 28.62427520751953, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5771763920783997, "step": 93, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.001514196395874 }, { "episode": 1520, "epoch": 0.027321422151921487, "loss/policy_avg": 0.5848271250724792, "lr": 9.939928425357874e-06, "objective/entropy": -91.07750701904297, "objective/kl": 12.661925315856934, "objective/non_score_reward": -1.2661924362182617, "objective/rlhf_reward": -3.239941294464182, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 20.221342086791992, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7387726306915283, "step": 94, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9965746402740479 }, { "episode": 1536, "epoch": 0.027609016069310134, "loss/policy_avg": 0.45774608850479126, "lr": 9.93928936605317e-06, "objective/entropy": 61.91606903076172, "objective/kl": 13.879372596740723, "objective/non_score_reward": -1.387937307357788, "objective/rlhf_reward": -4.151749169826507, "objective/scores": 0.35, "policy/approxkl_avg": 77.81513977050781, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4593799114227295, "step": 95, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.001408576965332 }, { "episode": 1552, "epoch": 0.02789660998669878, "loss/policy_avg": 0.15000438690185547, "lr": 9.938650306748467e-06, "objective/entropy": 105.67562866210938, "objective/kl": 19.344045639038086, "objective/non_score_reward": -1.9344044923782349, "objective/rlhf_reward": -6.287019650550231, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 139.15414428710938, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7023290991783142, "step": 96, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000124454498291 }, { "episode": 1568, "epoch": 0.028184203904087428, "loss/policy_avg": 0.15950141847133636, "lr": 9.938011247443764e-06, "objective/entropy": 163.98699951171875, "objective/kl": 27.47066307067871, "objective/non_score_reward": -2.7470664978027344, "objective/rlhf_reward": -8.865559878126655, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 212.09678649902344, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8216714859008789, "step": 97, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9999010562896729 }, { "episode": 1584, "epoch": 0.028471797821476075, "loss/policy_avg": 0.08940532058477402, "lr": 9.937372188139061e-06, "objective/entropy": -107.25084686279297, "objective/kl": 20.170251846313477, "objective/non_score_reward": -2.0170252323150635, "objective/rlhf_reward": -5.144382034183714, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 45.363121032714844, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7300325632095337, "step": 98, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9954917430877686 }, { "episode": 1600, "epoch": 0.02875939173886472, "loss/policy_avg": 0.1482250690460205, "lr": 9.936733128834358e-06, "objective/entropy": 225.79022216796875, "objective/kl": 18.687984466552734, "objective/non_score_reward": -1.8687984943389893, "objective/rlhf_reward": -6.024595717997894, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 224.1140594482422, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.834899365901947, "step": 99, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9977468252182007 }, { "episode": 1616, "epoch": 0.029046985656253372, "loss/policy_avg": 0.1928468644618988, "lr": 9.936094069529653e-06, "objective/entropy": 8.741950988769531, "objective/kl": 15.258420944213867, "objective/non_score_reward": -1.5258420705795288, "objective/rlhf_reward": -4.652770261378631, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 31.214942932128906, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5689660310745239, "step": 100, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9975221157073975 }, { "episode": 1632, "epoch": 0.02933457957364202, "loss/policy_avg": -0.0598021075129509, "lr": 9.93545501022495e-06, "objective/entropy": -18.486255645751953, "objective/kl": 25.29681396484375, "objective/non_score_reward": -2.5296812057495117, "objective/rlhf_reward": -7.718725478649139, "objective/scores": 0.6, "policy/approxkl_avg": 111.83526611328125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5892493724822998, "step": 101, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9977436065673828 }, { "episode": 1648, "epoch": 0.029622173491030666, "loss/policy_avg": 0.47538769245147705, "lr": 9.934815950920245e-06, "objective/entropy": -110.20201873779297, "objective/kl": 22.853384017944336, "objective/non_score_reward": -2.2853384017944336, "objective/rlhf_reward": -7.690755407424316, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 73.58186340332031, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7267776727676392, "step": 102, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9981677532196045 }, { "episode": 1664, "epoch": 0.029909767408419313, "loss/policy_avg": -0.00016094697639346123, "lr": 9.934176891615542e-06, "objective/entropy": 144.46910095214844, "objective/kl": 16.481285095214844, "objective/non_score_reward": -1.6481282711029053, "objective/rlhf_reward": -5.26700029882781, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 115.55538177490234, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8190140724182129, "step": 103, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.999662160873413 }, { "episode": 1680, "epoch": 0.03019736132580796, "loss/policy_avg": 0.3124885559082031, "lr": 9.933537832310839e-06, "objective/entropy": 82.22334289550781, "objective/kl": 23.603931427001953, "objective/non_score_reward": -2.3603932857513428, "objective/rlhf_reward": -8.04157326221466, "objective/scores": 0.35, "policy/approxkl_avg": 249.64776611328125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6342385411262512, "step": 104, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9999876022338867 }, { "episode": 1696, "epoch": 0.030484955243196606, "loss/policy_avg": 0.5430713891983032, "lr": 9.932898773006136e-06, "objective/entropy": 120.59968566894531, "objective/kl": 16.078868865966797, "objective/non_score_reward": -1.6078869104385376, "objective/rlhf_reward": -4.87528809806402, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 28.921520233154297, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7989763021469116, "step": 105, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9993932247161865 }, { "episode": 1712, "epoch": 0.030772549160585253, "loss/policy_avg": 0.40486371517181396, "lr": 9.932259713701433e-06, "objective/entropy": -83.70709228515625, "objective/kl": 21.060504913330078, "objective/non_score_reward": -2.106050491333008, "objective/rlhf_reward": -6.908430480750736, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 57.908729553222656, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.8018752932548523, "step": 106, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9995449781417847 }, { "episode": 1728, "epoch": 0.0310601430779739, "loss/policy_avg": 0.4627416133880615, "lr": 9.931620654396728e-06, "objective/entropy": -27.708335876464844, "objective/kl": 19.676761627197266, "objective/non_score_reward": -1.9676761627197266, "objective/rlhf_reward": -7.870704412460327, "objective/scores": 0.0, "policy/approxkl_avg": 89.03375244140625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7063722014427185, "step": 107, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9978171586990356 }, { "episode": 1744, "epoch": 0.03134773699536255, "loss/policy_avg": 0.24644207954406738, "lr": 9.930981595092025e-06, "objective/entropy": 84.95053100585938, "objective/kl": 17.334156036376953, "objective/non_score_reward": -1.7334158420562744, "objective/rlhf_reward": -5.417891824039158, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 59.91339111328125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5948917269706726, "step": 108, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9983779191970825 }, { "episode": 1760, "epoch": 0.0316353309127512, "loss/policy_avg": 0.10573781281709671, "lr": 9.930342535787322e-06, "objective/entropy": 37.63609313964844, "objective/kl": 19.209318161010742, "objective/non_score_reward": -1.9209318161010742, "objective/rlhf_reward": -5.56102097250608, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 49.033843994140625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.550654411315918, "step": 109, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.999387264251709 }, { "episode": 1776, "epoch": 0.031922924830139844, "loss/policy_avg": 1.9212778806686401, "lr": 9.929703476482619e-06, "objective/entropy": -80.58729553222656, "objective/kl": 21.281909942626953, "objective/non_score_reward": -2.1281909942626953, "objective/rlhf_reward": -6.779431001345316, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 400.0589599609375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7861940860748291, "step": 110, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9973957538604736 }, { "episode": 1792, "epoch": 0.03221051874752849, "loss/policy_avg": 0.275944322347641, "lr": 9.929064417177915e-06, "objective/entropy": -208.78277587890625, "objective/kl": 19.070934295654297, "objective/non_score_reward": -1.9070935249328613, "objective/rlhf_reward": -3.228373861312866, "objective/scores": 1.1, "policy/approxkl_avg": 49.866607666015625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6659146547317505, "step": 111, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.001814842224121 }, { "episode": 1808, "epoch": 0.03249811266491714, "loss/policy_avg": 0.01928192749619484, "lr": 9.928425357873212e-06, "objective/entropy": -55.08910369873047, "objective/kl": 13.248590469360352, "objective/non_score_reward": -1.3248591423034668, "objective/rlhf_reward": -3.940187000964565, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 120.32939147949219, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7349205017089844, "step": 112, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9983385801315308 }, { "episode": 1824, "epoch": 0.032785706582305785, "loss/policy_avg": 0.46047085523605347, "lr": 9.927786298568507e-06, "objective/entropy": 62.72016906738281, "objective/kl": 23.932682037353516, "objective/non_score_reward": -2.39326810836792, "objective/rlhf_reward": -5.173073148727417, "objective/scores": 1.1, "policy/approxkl_avg": 117.28857421875, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7454954385757446, "step": 113, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.001185894012451 }, { "episode": 1840, "epoch": 0.03307330049969443, "loss/policy_avg": 0.45565682649612427, "lr": 9.927147239263804e-06, "objective/entropy": 91.39878845214844, "objective/kl": 23.617773056030273, "objective/non_score_reward": -2.3617773056030273, "objective/rlhf_reward": -8.121596846610231, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 150.77520751953125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6033438444137573, "step": 114, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9999167919158936 }, { "episode": 1856, "epoch": 0.03336089441708308, "loss/policy_avg": 0.7295475602149963, "lr": 9.926508179959101e-06, "objective/entropy": 207.79177856445312, "objective/kl": 27.30187225341797, "objective/non_score_reward": -2.73018741607666, "objective/rlhf_reward": -9.187415854136148, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 190.16458129882812, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.674660861492157, "step": 115, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9992830753326416 }, { "episode": 1872, "epoch": 0.033648488334471725, "loss/policy_avg": 0.6712960600852966, "lr": 9.925869120654398e-06, "objective/entropy": -117.77911376953125, "objective/kl": 24.611560821533203, "objective/non_score_reward": -2.461156129837036, "objective/rlhf_reward": -8.485374891494198, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 180.31971740722656, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8039132356643677, "step": 116, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9965860843658447 }, { "episode": 1888, "epoch": 0.03393608225186037, "loss/policy_avg": 0.34298884868621826, "lr": 9.925230061349695e-06, "objective/entropy": -77.61781311035156, "objective/kl": 19.15323829650879, "objective/non_score_reward": -1.9153238534927368, "objective/rlhf_reward": -5.2612955331802365, "objective/scores": 0.6, "policy/approxkl_avg": 146.52737426757812, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5704625248908997, "step": 117, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9994972944259644 }, { "episode": 1904, "epoch": 0.03422367616924902, "loss/policy_avg": 0.038467422127723694, "lr": 9.92459100204499e-06, "objective/entropy": 49.699798583984375, "objective/kl": 14.500207901000977, "objective/non_score_reward": -1.4500207901000977, "objective/rlhf_reward": -4.3762512101727395, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 53.40303421020508, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7844617366790771, "step": 118, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.998944878578186 }, { "episode": 1920, "epoch": 0.034511270086637666, "loss/policy_avg": 0.21085724234580994, "lr": 9.923951942740287e-06, "objective/entropy": 171.3436279296875, "objective/kl": 26.085678100585938, "objective/non_score_reward": -2.608567714691162, "objective/rlhf_reward": -10.434271335601807, "objective/scores": 0.0, "policy/approxkl_avg": 62.63068389892578, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7893345355987549, "step": 119, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0008397102355957 }, { "episode": 1936, "epoch": 0.03479886400402631, "loss/policy_avg": 2.211275100708008, "lr": 9.923312883435584e-06, "objective/entropy": -61.354347229003906, "objective/kl": 20.615779876708984, "objective/non_score_reward": -2.0615780353546143, "objective/rlhf_reward": -6.82247974415597, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 51.41122055053711, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6323862075805664, "step": 120, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0031816959381104 }, { "episode": 1952, "epoch": 0.03508645792141496, "loss/policy_avg": 0.09407002478837967, "lr": 9.92267382413088e-06, "objective/entropy": 61.494049072265625, "objective/kl": 16.42398452758789, "objective/non_score_reward": -1.6423983573913574, "objective/rlhf_reward": -5.053821527751621, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 42.54444122314453, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6436998844146729, "step": 121, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0003204345703125 }, { "episode": 1968, "epoch": 0.03537405183880361, "loss/policy_avg": 4.1326775550842285, "lr": 9.922034764826178e-06, "objective/entropy": 182.5974884033203, "objective/kl": 23.137367248535156, "objective/non_score_reward": -2.313736915588379, "objective/rlhf_reward": -7.895697408650799, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 43.000953674316406, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4477632939815521, "step": 122, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9996920824050903 }, { "episode": 1984, "epoch": 0.03566164575619225, "loss/policy_avg": 0.6063717603683472, "lr": 9.921395705521473e-06, "objective/entropy": 374.73065185546875, "objective/kl": 19.553401947021484, "objective/non_score_reward": -1.9553401470184326, "objective/rlhf_reward": -6.4213602304458615, "objective/scores": 0.35, "policy/approxkl_avg": 66.70773315429688, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.8944922685623169, "step": 123, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.997408151626587 }, { "episode": 2000, "epoch": 0.03594923967358091, "loss/policy_avg": -0.24504688382148743, "lr": 9.92075664621677e-06, "objective/entropy": 61.65036392211914, "objective/kl": 24.3512020111084, "objective/non_score_reward": -2.435120105743408, "objective/rlhf_reward": -8.39884465029779, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 52.655914306640625, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.7538488507270813, "step": 124, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.018387794494629 }, { "episode": 2016, "epoch": 0.036236833590969554, "loss/policy_avg": 0.5044976472854614, "lr": 9.920117586912067e-06, "objective/entropy": 252.68862915039062, "objective/kl": 31.767620086669922, "objective/non_score_reward": -3.176762104034424, "objective/rlhf_reward": -11.34779831144659, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 143.51535034179688, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6717317700386047, "step": 125, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9978291988372803 }, { "episode": 2032, "epoch": 0.0365244275083582, "loss/policy_avg": 0.5120067596435547, "lr": 9.919478527607362e-06, "objective/entropy": -164.51651000976562, "objective/kl": 27.349618911743164, "objective/non_score_reward": -2.734961986541748, "objective/rlhf_reward": -9.539847826957702, "objective/scores": 0.35, "policy/approxkl_avg": 40.27062225341797, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7697643041610718, "step": 126, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.998326301574707 }, { "episode": 2048, "epoch": 0.03681202142574685, "loss/policy_avg": 0.6887588500976562, "lr": 9.918839468302659e-06, "objective/entropy": 69.27521514892578, "objective/kl": 22.67069435119629, "objective/non_score_reward": -2.2670693397521973, "objective/rlhf_reward": -7.6896756077683985, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 162.61952209472656, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6613045930862427, "step": 127, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.999122977256775 }, { "episode": 2064, "epoch": 0.037099615343135495, "loss/policy_avg": 0.3447116017341614, "lr": 9.918200408997956e-06, "objective/entropy": 212.85906982421875, "objective/kl": 26.792640686035156, "objective/non_score_reward": -2.6792640686035156, "objective/rlhf_reward": -8.983723298708597, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 97.52276611328125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.9090495109558105, "step": 128, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0001721382141113 }, { "episode": 2080, "epoch": 0.03738720926052414, "loss/policy_avg": 0.48409304022789, "lr": 9.917561349693252e-06, "objective/entropy": 190.03709411621094, "objective/kl": 18.16942596435547, "objective/non_score_reward": -1.8169424533843994, "objective/rlhf_reward": -5.145064058081184, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 45.51705551147461, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.46248874068260193, "step": 129, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9967193603515625 }, { "episode": 2096, "epoch": 0.03767480317791279, "loss/policy_avg": 0.5664651989936829, "lr": 9.91692229038855e-06, "objective/entropy": 192.7556610107422, "objective/kl": 20.14044952392578, "objective/non_score_reward": -2.014044761657715, "objective/rlhf_reward": -6.231350894245217, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 29.526737213134766, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.8081471920013428, "step": 130, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9979431629180908 }, { "episode": 2112, "epoch": 0.037962397095301435, "loss/policy_avg": 0.41585665941238403, "lr": 9.916283231083844e-06, "objective/entropy": -49.55967712402344, "objective/kl": 34.61396789550781, "objective/non_score_reward": -3.4613966941833496, "objective/rlhf_reward": -12.466984846679072, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 176.34915161132812, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7800864577293396, "step": 131, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9994279146194458 }, { "episode": 2128, "epoch": 0.03824999101269008, "loss/policy_avg": 0.11256570369005203, "lr": 9.915644171779141e-06, "objective/entropy": 156.05429077148438, "objective/kl": 29.602603912353516, "objective/non_score_reward": -2.9602606296539307, "objective/rlhf_reward": -7.441042518615722, "objective/scores": 1.1, "policy/approxkl_avg": 77.59361267089844, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8300012350082397, "step": 132, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9990795850753784 }, { "episode": 2144, "epoch": 0.03853758493007873, "loss/policy_avg": 0.5523125529289246, "lr": 9.915005112474438e-06, "objective/entropy": -9.20687484741211, "objective/kl": 18.338748931884766, "objective/non_score_reward": -1.8338749408721924, "objective/rlhf_reward": -5.510670955452035, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 8.425865173339844, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6365004777908325, "step": 133, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000248908996582 }, { "episode": 2160, "epoch": 0.038825178847467376, "loss/policy_avg": -0.049435317516326904, "lr": 9.914366053169735e-06, "objective/entropy": -86.67240142822266, "objective/kl": 31.985502243041992, "objective/non_score_reward": -3.198550224304199, "objective/rlhf_reward": -11.237941711154535, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 199.85411071777344, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7536939382553101, "step": 134, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000448703765869 }, { "episode": 2176, "epoch": 0.03911277276485602, "loss/policy_avg": 2.6105287075042725, "lr": 9.913726993865032e-06, "objective/entropy": -58.581077575683594, "objective/kl": 16.238283157348633, "objective/non_score_reward": -1.6238282918930054, "objective/rlhf_reward": -6.495313286781311, "objective/scores": 0.0, "policy/approxkl_avg": 10.216068267822266, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5845435857772827, "step": 135, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.019744396209717 }, { "episode": 2192, "epoch": 0.03940036668224467, "loss/policy_avg": 0.24281054735183716, "lr": 9.913087934560329e-06, "objective/entropy": -94.54998779296875, "objective/kl": 24.751571655273438, "objective/non_score_reward": -2.4751572608947754, "objective/rlhf_reward": -8.384857022556004, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 124.94600677490234, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6612954139709473, "step": 136, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9992079734802246 }, { "episode": 2208, "epoch": 0.039687960599633317, "loss/policy_avg": 0.3692808449268341, "lr": 9.912448875255624e-06, "objective/entropy": 131.55032348632812, "objective/kl": 33.53261184692383, "objective/non_score_reward": -3.3532609939575195, "objective/rlhf_reward": -10.489325438381407, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 40.712066650390625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6237879395484924, "step": 137, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9970109462738037 }, { "episode": 2224, "epoch": 0.03997555451702196, "loss/policy_avg": 0.7208542823791504, "lr": 9.911809815950921e-06, "objective/entropy": -9.526227951049805, "objective/kl": 23.76919937133789, "objective/non_score_reward": -2.376919984817505, "objective/rlhf_reward": -9.507680296897888, "objective/scores": 0.0, "policy/approxkl_avg": 324.29388427734375, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5352585315704346, "step": 138, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9988393783569336 }, { "episode": 2240, "epoch": 0.04026314843441061, "loss/policy_avg": 0.24535533785820007, "lr": 9.911170756646218e-06, "objective/entropy": -117.0184555053711, "objective/kl": 12.411272048950195, "objective/non_score_reward": -1.2411272525787354, "objective/rlhf_reward": -3.5139104529336542, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 86.15127563476562, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6268381476402283, "step": 139, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9967679977416992 }, { "episode": 2256, "epoch": 0.04055074235179926, "loss/policy_avg": 1.7714985609054565, "lr": 9.910531697341515e-06, "objective/entropy": -78.9849853515625, "objective/kl": 19.77252197265625, "objective/non_score_reward": -1.9772523641586304, "objective/rlhf_reward": -7.9090094566345215, "objective/scores": 0.0, "policy/approxkl_avg": 51.44104766845703, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.48759615421295166, "step": 140, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.998913288116455 }, { "episode": 2272, "epoch": 0.040838336269187904, "loss/policy_avg": 0.42394816875457764, "lr": 9.909892638036812e-06, "objective/entropy": -52.96235656738281, "objective/kl": 23.680797576904297, "objective/non_score_reward": -2.368079662322998, "objective/rlhf_reward": -7.647490258487771, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 46.429683685302734, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7907830476760864, "step": 141, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9969936609268188 }, { "episode": 2288, "epoch": 0.04112593018657655, "loss/policy_avg": 0.42378103733062744, "lr": 9.909253578732107e-06, "objective/entropy": -55.83934783935547, "objective/kl": 19.108665466308594, "objective/non_score_reward": -1.910866618156433, "objective/rlhf_reward": -4.7197478159677715, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 87.44278717041016, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7084353566169739, "step": 142, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.99775230884552 }, { "episode": 2304, "epoch": 0.0414135241039652, "loss/policy_avg": 0.8798868656158447, "lr": 9.908614519427404e-06, "objective/entropy": 397.6534729003906, "objective/kl": 35.663856506347656, "objective/non_score_reward": -3.5663862228393555, "objective/rlhf_reward": -12.318133900837836, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 403.22802734375, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.9414247274398804, "step": 143, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9968068599700928 }, { "episode": 2320, "epoch": 0.04170111802135385, "loss/policy_avg": 0.8013919591903687, "lr": 9.9079754601227e-06, "objective/entropy": 92.8883285522461, "objective/kl": 23.82331657409668, "objective/non_score_reward": -2.382331609725952, "objective/rlhf_reward": -8.170076781247538, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 73.349609375, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6135187149047852, "step": 144, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0004608631134033 }, { "episode": 2336, "epoch": 0.0419887119387425, "loss/policy_avg": 2.381652593612671, "lr": 9.907336400817996e-06, "objective/entropy": -49.03077697753906, "objective/kl": 20.151020050048828, "objective/non_score_reward": -2.0151021480560303, "objective/rlhf_reward": -6.5041489293247015, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 37.75957489013672, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7855414152145386, "step": 145, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0004804134368896 }, { "episode": 2352, "epoch": 0.042276305856131145, "loss/policy_avg": 10.612133026123047, "lr": 9.906697341513293e-06, "objective/entropy": 31.498912811279297, "objective/kl": 22.87250518798828, "objective/non_score_reward": -2.2872507572174072, "objective/rlhf_reward": -7.487143372715103, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 67.64070892333984, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6964052319526672, "step": 146, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.002523422241211 }, { "episode": 2368, "epoch": 0.04256389977351979, "loss/policy_avg": 1.218620777130127, "lr": 9.90605828220859e-06, "objective/entropy": -186.7276153564453, "objective/kl": 17.879009246826172, "objective/non_score_reward": -1.7879009246826172, "objective/rlhf_reward": -2.751603758335113, "objective/scores": 1.1, "policy/approxkl_avg": 62.54943084716797, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.9248688220977783, "step": 147, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.999516248703003 }, { "episode": 2384, "epoch": 0.04285149369090844, "loss/policy_avg": 0.08095124363899231, "lr": 9.905419222903886e-06, "objective/entropy": -79.130859375, "objective/kl": 17.005231857299805, "objective/non_score_reward": -1.7005233764648438, "objective/rlhf_reward": -5.378261108596889, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 14.491683959960938, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5301992893218994, "step": 148, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9981908798217773 }, { "episode": 2400, "epoch": 0.043139087608297086, "loss/policy_avg": 0.9632350206375122, "lr": 9.904780163599183e-06, "objective/entropy": 78.35763549804688, "objective/kl": 20.501068115234375, "objective/non_score_reward": -2.050107002258301, "objective/rlhf_reward": -3.8004277706146237, "objective/scores": 1.1, "policy/approxkl_avg": 42.78419494628906, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6539649963378906, "step": 149, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9986085891723633 }, { "episode": 2416, "epoch": 0.04342668152568573, "loss/policy_avg": 0.2573207914829254, "lr": 9.904141104294478e-06, "objective/entropy": 139.81314086914062, "objective/kl": 15.294164657592773, "objective/non_score_reward": -1.5294163227081299, "objective/rlhf_reward": -4.739063062754971, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 44.518585205078125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.9615079760551453, "step": 150, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0004477500915527 }, { "episode": 2432, "epoch": 0.04371427544307438, "loss/policy_avg": 0.1424349546432495, "lr": 9.903502044989775e-06, "objective/entropy": -200.99420166015625, "objective/kl": 23.035701751708984, "objective/non_score_reward": -2.30357027053833, "objective/rlhf_reward": -7.835678675261837, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 147.28530883789062, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5872718691825867, "step": 151, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.000128746032715 }, { "episode": 2448, "epoch": 0.044001869360463026, "loss/policy_avg": 0.2018139809370041, "lr": 9.902862985685072e-06, "objective/entropy": 265.19927978515625, "objective/kl": 22.482166290283203, "objective/non_score_reward": -2.2482166290283203, "objective/rlhf_reward": -7.65123014739099, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 45.75090408325195, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6457411050796509, "step": 152, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9972870349884033 }, { "episode": 2464, "epoch": 0.04428946327785167, "loss/policy_avg": 1.6731089353561401, "lr": 9.902223926380369e-06, "objective/entropy": 88.4649658203125, "objective/kl": 23.56682586669922, "objective/non_score_reward": -2.356682300567627, "objective/rlhf_reward": -7.479318688588078, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 46.715396881103516, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5394150614738464, "step": 153, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9959262609481812 }, { "episode": 2480, "epoch": 0.04457705719524032, "loss/policy_avg": 0.28844547271728516, "lr": 9.901584867075666e-06, "objective/entropy": 153.28744506835938, "objective/kl": 25.101318359375, "objective/non_score_reward": -2.5101318359375, "objective/rlhf_reward": -8.484268038478449, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 65.64702606201172, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.48713332414627075, "step": 154, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9966403245925903 }, { "episode": 2496, "epoch": 0.04486465111262897, "loss/policy_avg": 0.5096696615219116, "lr": 9.900945807770961e-06, "objective/entropy": 155.93728637695312, "objective/kl": 19.505783081054688, "objective/non_score_reward": -1.9505780935287476, "objective/rlhf_reward": -6.3517142339662165, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 54.11931610107422, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5123778581619263, "step": 155, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9984791278839111 }, { "episode": 2512, "epoch": 0.045152245030017614, "loss/policy_avg": 0.06939780712127686, "lr": 9.900306748466258e-06, "objective/entropy": 18.287960052490234, "objective/kl": 17.8365421295166, "objective/non_score_reward": -1.7836542129516602, "objective/rlhf_reward": -5.792981436758667, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 29.395599365234375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5711311101913452, "step": 156, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9997730255126953 }, { "episode": 2528, "epoch": 0.04543983894740626, "loss/policy_avg": 0.6630462408065796, "lr": 9.899667689161555e-06, "objective/entropy": 63.43556213378906, "objective/kl": 22.572792053222656, "objective/non_score_reward": -2.2572789192199707, "objective/rlhf_reward": -7.687480738669066, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 46.135581970214844, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5037804841995239, "step": 157, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9967372417449951 }, { "episode": 2544, "epoch": 0.04572743286479491, "loss/policy_avg": 0.9545019865036011, "lr": 9.899028629856852e-06, "objective/entropy": 78.440673828125, "objective/kl": 24.54024314880371, "objective/non_score_reward": -2.454024314880371, "objective/rlhf_reward": -8.43749509104858, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 15.233887672424316, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5358680486679077, "step": 158, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0002801418304443 }, { "episode": 2560, "epoch": 0.046015026782183555, "loss/policy_avg": 0.7431780099868774, "lr": 9.898389570552149e-06, "objective/entropy": 140.66659545898438, "objective/kl": 15.743444442749023, "objective/non_score_reward": -1.574344515800476, "objective/rlhf_reward": -4.938128196929378, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 3.185384511947632, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.35770517587661743, "step": 159, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9985697269439697 }, { "episode": 2576, "epoch": 0.0463026206995722, "loss/policy_avg": 0.5926499366760254, "lr": 9.897750511247446e-06, "objective/entropy": 142.6728973388672, "objective/kl": 26.324443817138672, "objective/non_score_reward": -2.6324446201324463, "objective/rlhf_reward": -10.529778361320496, "objective/scores": 0.0, "policy/approxkl_avg": 163.95303344726562, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6692796945571899, "step": 160, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0011281967163086 }, { "episode": 2592, "epoch": 0.04659021461696085, "loss/policy_avg": 0.5165088772773743, "lr": 9.89711145194274e-06, "objective/entropy": -49.34636688232422, "objective/kl": 23.45389175415039, "objective/non_score_reward": -2.3453893661499023, "objective/rlhf_reward": -7.865785443576511, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 21.34342384338379, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6546250581741333, "step": 161, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9961342811584473 }, { "episode": 2608, "epoch": 0.046877808534349495, "loss/policy_avg": 0.06444612145423889, "lr": 9.896472392638038e-06, "objective/entropy": 3.992961883544922, "objective/kl": 22.590843200683594, "objective/non_score_reward": -2.259084463119507, "objective/rlhf_reward": -7.636337852478027, "objective/scores": 0.35, "policy/approxkl_avg": 78.1329345703125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6085139513015747, "step": 162, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9983890056610107 }, { "episode": 2624, "epoch": 0.04716540245173815, "loss/policy_avg": -0.5813350081443787, "lr": 9.895833333333334e-06, "objective/entropy": 129.43629455566406, "objective/kl": 18.305377960205078, "objective/non_score_reward": -1.8305377960205078, "objective/rlhf_reward": -5.841198506768107, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 35.31721496582031, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.5479708313941956, "step": 163, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.006561756134033 }, { "episode": 2640, "epoch": 0.047452996369126796, "loss/policy_avg": 0.771713376045227, "lr": 9.895194274028631e-06, "objective/entropy": -7.5702056884765625, "objective/kl": 26.34789276123047, "objective/non_score_reward": -2.634789228439331, "objective/rlhf_reward": -9.023385488780672, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 189.376708984375, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7600178122520447, "step": 164, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9973194599151611 }, { "episode": 2656, "epoch": 0.04774059028651544, "loss/policy_avg": 0.3846855163574219, "lr": 9.894555214723928e-06, "objective/entropy": 208.37188720703125, "objective/kl": 24.167360305786133, "objective/non_score_reward": -2.416736125946045, "objective/rlhf_reward": -8.307694399093075, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 50.201210021972656, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8115335702896118, "step": 165, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9998011589050293 }, { "episode": 2672, "epoch": 0.04802818420390409, "loss/policy_avg": 0.1327829658985138, "lr": 9.893916155419225e-06, "objective/entropy": 137.91152954101562, "objective/kl": 16.795686721801758, "objective/non_score_reward": -1.6795687675476074, "objective/rlhf_reward": -5.202503287585911, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 83.59907531738281, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6950229406356812, "step": 166, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9975529909133911 }, { "episode": 2688, "epoch": 0.048315778121292736, "loss/policy_avg": 0.1259589046239853, "lr": 9.89327709611452e-06, "objective/entropy": -62.611698150634766, "objective/kl": 23.73765754699707, "objective/non_score_reward": -2.373765707015991, "objective/rlhf_reward": -8.095063066482544, "objective/scores": 0.35, "policy/approxkl_avg": 30.360702514648438, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4680527448654175, "step": 167, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0010440349578857 }, { "episode": 2704, "epoch": 0.04860337203868138, "loss/policy_avg": 1.4625483751296997, "lr": 9.892638036809815e-06, "objective/entropy": 191.9486083984375, "objective/kl": 29.09479331970215, "objective/non_score_reward": -2.9094796180725098, "objective/rlhf_reward": -9.976058488309967, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 96.90817260742188, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4573523998260498, "step": 168, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.999511957168579 }, { "episode": 2720, "epoch": 0.04889096595607003, "loss/policy_avg": 0.39898326992988586, "lr": 9.891998977505112e-06, "objective/entropy": 76.32606506347656, "objective/kl": 25.170101165771484, "objective/non_score_reward": -2.51701021194458, "objective/rlhf_reward": -8.58708846848762, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 109.47344970703125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7625200748443604, "step": 169, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9968773126602173 }, { "episode": 2736, "epoch": 0.04917855987345868, "loss/policy_avg": 1.2032477855682373, "lr": 9.89135991820041e-06, "objective/entropy": -85.850830078125, "objective/kl": 23.681184768676758, "objective/non_score_reward": -2.3681185245513916, "objective/rlhf_reward": -7.916214673724726, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 8.655494689941406, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7028899788856506, "step": 170, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0086746215820312 }, { "episode": 2752, "epoch": 0.049466153790847324, "loss/policy_avg": 0.2989116311073303, "lr": 9.890720858895706e-06, "objective/entropy": -244.5385284423828, "objective/kl": 25.30226707458496, "objective/non_score_reward": -2.530226707458496, "objective/rlhf_reward": -8.74230466136108, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 80.04335021972656, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7547532320022583, "step": 171, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9984104633331299 }, { "episode": 2768, "epoch": 0.04975374770823597, "loss/policy_avg": 0.15676680207252502, "lr": 9.890081799591003e-06, "objective/entropy": -46.70368576049805, "objective/kl": 23.916561126708984, "objective/non_score_reward": -2.3916563987731934, "objective/rlhf_reward": -8.142793853481379, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 98.60531616210938, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4980742335319519, "step": 172, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.000648021697998 }, { "episode": 2784, "epoch": 0.05004134162562462, "loss/policy_avg": 2.26867413520813, "lr": 9.8894427402863e-06, "objective/entropy": -103.40653991699219, "objective/kl": 13.933716773986816, "objective/non_score_reward": -1.39337158203125, "objective/rlhf_reward": -3.969366837207394, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 48.939292907714844, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5038570165634155, "step": 173, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9992566108703613 }, { "episode": 2800, "epoch": 0.050328935543013265, "loss/policy_avg": 2.38254714012146, "lr": 9.888803680981595e-06, "objective/entropy": 186.43983459472656, "objective/kl": 22.118637084960938, "objective/non_score_reward": -2.2118635177612305, "objective/rlhf_reward": -7.5219414568244645, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 46.10081481933594, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5202944278717041, "step": 174, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9985566139221191 }, { "episode": 2816, "epoch": 0.05061652946040191, "loss/policy_avg": 0.7353519201278687, "lr": 9.888164621676892e-06, "objective/entropy": 40.153011322021484, "objective/kl": 24.53411865234375, "objective/non_score_reward": -2.453411817550659, "objective/rlhf_reward": -8.413647150993347, "objective/scores": 0.35, "policy/approxkl_avg": 73.58890533447266, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5293036699295044, "step": 175, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9974313974380493 }, { "episode": 2832, "epoch": 0.05090412337779056, "loss/policy_avg": 0.09056591242551804, "lr": 9.887525562372189e-06, "objective/entropy": 271.1833190917969, "objective/kl": 18.394960403442383, "objective/non_score_reward": -1.83949613571167, "objective/rlhf_reward": -5.8770319251374, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 106.73664855957031, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.683214545249939, "step": 176, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.000105619430542 }, { "episode": 2848, "epoch": 0.051191717295179205, "loss/policy_avg": 1.4375742673873901, "lr": 9.886886503067486e-06, "objective/entropy": 21.05878448486328, "objective/kl": 19.652141571044922, "objective/non_score_reward": -1.9652140140533447, "objective/rlhf_reward": -5.913445363717015, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 19.185291290283203, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5441437363624573, "step": 177, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9982903003692627 }, { "episode": 2864, "epoch": 0.05147931121256785, "loss/policy_avg": 0.9233704209327698, "lr": 9.886247443762783e-06, "objective/entropy": 79.99378204345703, "objective/kl": 30.930316925048828, "objective/non_score_reward": -3.093031644821167, "objective/rlhf_reward": -10.42471582718366, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 71.49031829833984, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.4192318320274353, "step": 178, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9994785785675049 }, { "episode": 2880, "epoch": 0.0517669051299565, "loss/policy_avg": 0.8243035078048706, "lr": 9.88560838445808e-06, "objective/entropy": -79.08426666259766, "objective/kl": 21.69200897216797, "objective/non_score_reward": -2.169200897216797, "objective/rlhf_reward": -7.072683844629841, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 111.395263671875, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5561103820800781, "step": 179, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9990267753601074 }, { "episode": 2896, "epoch": 0.052054499047345146, "loss/policy_avg": 0.20029950141906738, "lr": 9.884969325153375e-06, "objective/entropy": 29.943138122558594, "objective/kl": 23.813167572021484, "objective/non_score_reward": -2.381316661834717, "objective/rlhf_reward": -9.525267362594604, "objective/scores": 0.0, "policy/approxkl_avg": 42.68115997314453, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6918896436691284, "step": 180, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9963979721069336 }, { "episode": 2912, "epoch": 0.05234209296473379, "loss/policy_avg": 0.014301195740699768, "lr": 9.884330265848671e-06, "objective/entropy": 164.71829223632812, "objective/kl": 24.91703224182129, "objective/non_score_reward": -2.4917030334472656, "objective/rlhf_reward": -7.84410613991407, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 178.1289520263672, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.43327000737190247, "step": 181, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9986697435379028 }, { "episode": 2928, "epoch": 0.052629686882122446, "loss/policy_avg": 0.9095668792724609, "lr": 9.883691206543968e-06, "objective/entropy": 107.3670883178711, "objective/kl": 29.20984649658203, "objective/non_score_reward": -2.920984983444214, "objective/rlhf_reward": -10.079820070330221, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 77.46437072753906, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6489860415458679, "step": 182, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9962208271026611 }, { "episode": 2944, "epoch": 0.05291728079951109, "loss/policy_avg": 0.2735748589038849, "lr": 9.883052147239265e-06, "objective/entropy": 22.112346649169922, "objective/kl": 20.8614444732666, "objective/non_score_reward": -2.08614444732666, "objective/rlhf_reward": -6.682718520582306, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 121.54977416992188, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5333126187324524, "step": 183, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9991374015808105 }, { "episode": 2960, "epoch": 0.05320487471689974, "loss/policy_avg": 0.32464680075645447, "lr": 9.882413087934562e-06, "objective/entropy": 188.4505615234375, "objective/kl": 20.14493179321289, "objective/non_score_reward": -2.014493227005005, "objective/rlhf_reward": -6.542201125415501, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 49.658721923828125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7087539434432983, "step": 184, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0017189979553223 }, { "episode": 2976, "epoch": 0.05349246863428839, "loss/policy_avg": 0.6234300136566162, "lr": 9.881774028629857e-06, "objective/entropy": -28.301137924194336, "objective/kl": 22.407699584960938, "objective/non_score_reward": -2.240769863128662, "objective/rlhf_reward": -7.406820564475611, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 14.418500900268555, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.48261120915412903, "step": 185, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9975056648254395 }, { "episode": 2992, "epoch": 0.053780062551677034, "loss/policy_avg": 0.1302778124809265, "lr": 9.881134969325154e-06, "objective/entropy": 163.4349365234375, "objective/kl": 31.70010757446289, "objective/non_score_reward": -3.170010566711426, "objective/rlhf_reward": -10.280042505264282, "objective/scores": 0.6, "policy/approxkl_avg": 74.95491790771484, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.584516167640686, "step": 186, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.998785138130188 }, { "episode": 3008, "epoch": 0.05406765646906568, "loss/policy_avg": 0.22809255123138428, "lr": 9.880495910020451e-06, "objective/entropy": 209.2935333251953, "objective/kl": 20.395681381225586, "objective/non_score_reward": -2.0395681858062744, "objective/rlhf_reward": -6.816636970549254, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 63.878265380859375, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6551488637924194, "step": 187, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9995040893554688 }, { "episode": 3024, "epoch": 0.05435525038645433, "loss/policy_avg": 0.03126790001988411, "lr": 9.879856850715748e-06, "objective/entropy": 4.618324279785156, "objective/kl": 24.382261276245117, "objective/non_score_reward": -2.4382262229919434, "objective/rlhf_reward": -8.237132870944675, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 6.476547718048096, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5421229600906372, "step": 188, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9999804496765137 }, { "episode": 3040, "epoch": 0.054642844303842975, "loss/policy_avg": 0.3424449563026428, "lr": 9.879217791411043e-06, "objective/entropy": -136.56385803222656, "objective/kl": 33.151222229003906, "objective/non_score_reward": -3.315122127532959, "objective/rlhf_reward": -11.901238882277887, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 49.403472900390625, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5513160228729248, "step": 189, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.999922752380371 }, { "episode": 3056, "epoch": 0.05493043822123162, "loss/policy_avg": 0.8403773307800293, "lr": 9.87857873210634e-06, "objective/entropy": 322.4634704589844, "objective/kl": 26.295257568359375, "objective/non_score_reward": -2.629525661468506, "objective/rlhf_reward": -9.192590031653566, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 115.08587646484375, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7622972130775452, "step": 190, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9968273639678955 }, { "episode": 3072, "epoch": 0.05521803213862027, "loss/policy_avg": 0.19895562529563904, "lr": 9.877939672801637e-06, "objective/entropy": -226.75164794921875, "objective/kl": 16.52016830444336, "objective/non_score_reward": -1.6520167589187622, "objective/rlhf_reward": -4.78323804882438, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 43.135128021240234, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.48229825496673584, "step": 191, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.003657341003418 }, { "episode": 3088, "epoch": 0.055505626056008915, "loss/policy_avg": 0.132611945271492, "lr": 9.877300613496934e-06, "objective/entropy": 138.95777893066406, "objective/kl": 28.13532257080078, "objective/non_score_reward": -2.8135323524475098, "objective/rlhf_reward": -9.830297310550776, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 11.711568832397461, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6496654748916626, "step": 192, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9994869232177734 }, { "episode": 3104, "epoch": 0.05579321997339756, "loss/policy_avg": 0.13647544384002686, "lr": 9.876661554192229e-06, "objective/entropy": 228.907958984375, "objective/kl": 20.958343505859375, "objective/non_score_reward": -2.095834255218506, "objective/rlhf_reward": -6.435925911145146, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 77.2052230834961, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5815694332122803, "step": 193, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.998384714126587 }, { "episode": 3120, "epoch": 0.05608081389078621, "loss/policy_avg": 0.3623042702674866, "lr": 9.876022494887526e-06, "objective/entropy": 101.04753112792969, "objective/kl": 28.680049896240234, "objective/non_score_reward": -2.8680050373077393, "objective/rlhf_reward": -10.021421532245025, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 129.59266662597656, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5827709436416626, "step": 194, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9990119934082031 }, { "episode": 3136, "epoch": 0.056368407808174856, "loss/policy_avg": -0.07424932718276978, "lr": 9.875383435582823e-06, "objective/entropy": 245.4013671875, "objective/kl": 20.346391677856445, "objective/non_score_reward": -2.034639358520508, "objective/rlhf_reward": -6.01585108257917, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 31.92734718322754, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6253474950790405, "step": 195, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9984948635101318 }, { "episode": 3152, "epoch": 0.0566560017255635, "loss/policy_avg": 0.1401011198759079, "lr": 9.87474437627812e-06, "objective/entropy": 375.89263916015625, "objective/kl": 21.685848236083984, "objective/non_score_reward": -2.1685848236083984, "objective/rlhf_reward": -6.274339175224304, "objective/scores": 0.6, "policy/approxkl_avg": 131.50494384765625, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.9226024746894836, "step": 196, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.0007243156433105 }, { "episode": 3168, "epoch": 0.05694359564295215, "loss/policy_avg": -0.030730588361620903, "lr": 9.874105316973416e-06, "objective/entropy": 140.85540771484375, "objective/kl": 20.57616424560547, "objective/non_score_reward": -2.0576162338256836, "objective/rlhf_reward": -6.779867152781829, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 33.19469451904297, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7271202206611633, "step": 197, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.000385284423828 }, { "episode": 3184, "epoch": 0.057231189560340796, "loss/policy_avg": 2.7618093490600586, "lr": 9.873466257668712e-06, "objective/entropy": 179.97198486328125, "objective/kl": 28.560035705566406, "objective/non_score_reward": -2.856003761291504, "objective/rlhf_reward": -9.599186535152505, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 32.35374450683594, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.653136134147644, "step": 198, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.997452974319458 }, { "episode": 3200, "epoch": 0.05751878347772944, "loss/policy_avg": 0.6520799398422241, "lr": 9.872827198364009e-06, "objective/entropy": 110.88057708740234, "objective/kl": 31.026592254638672, "objective/non_score_reward": -3.1026594638824463, "objective/rlhf_reward": -11.06900267890039, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 33.437408447265625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5983192324638367, "step": 199, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9984240531921387 }, { "episode": 3216, "epoch": 0.05780637739511809, "loss/policy_avg": 0.19128543138504028, "lr": 9.872188139059305e-06, "objective/entropy": 234.22332763671875, "objective/kl": 33.926361083984375, "objective/non_score_reward": -3.3926358222961426, "objective/rlhf_reward": -12.119944910617217, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 190.18153381347656, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.9237314462661743, "step": 200, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9989691972732544 }, { "episode": 3232, "epoch": 0.058093971312506744, "loss/policy_avg": 0.04767340421676636, "lr": 9.871549079754602e-06, "objective/entropy": -7.490440368652344, "objective/kl": 16.179231643676758, "objective/non_score_reward": -1.6179232597351074, "objective/rlhf_reward": -4.348986568228279, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 3.6666202545166016, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5502414703369141, "step": 201, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9998970031738281 }, { "episode": 3248, "epoch": 0.05838156522989539, "loss/policy_avg": 0.009956400841474533, "lr": 9.8709100204499e-06, "objective/entropy": 184.87599182128906, "objective/kl": 30.518714904785156, "objective/non_score_reward": -3.0518715381622314, "objective/rlhf_reward": -10.865850737600951, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 144.38037109375, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.46582168340682983, "step": 202, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.0004916191101074 }, { "episode": 3264, "epoch": 0.05866915914728404, "loss/policy_avg": 0.8650859594345093, "lr": 9.870270961145196e-06, "objective/entropy": 60.665279388427734, "objective/kl": 30.722930908203125, "objective/non_score_reward": -3.0722928047180176, "objective/rlhf_reward": -10.80821907799995, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 197.62728881835938, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.45192649960517883, "step": 203, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9978991746902466 }, { "episode": 3280, "epoch": 0.058956753064672685, "loss/policy_avg": 0.7753949165344238, "lr": 9.869631901840491e-06, "objective/entropy": 224.53439331054688, "objective/kl": 35.50615692138672, "objective/non_score_reward": -3.5506153106689453, "objective/rlhf_reward": -9.80246195793152, "objective/scores": 1.1, "policy/approxkl_avg": 207.07131958007812, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.625594973564148, "step": 204, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.003218650817871 }, { "episode": 3296, "epoch": 0.05924434698206133, "loss/policy_avg": 0.12218689173460007, "lr": 9.868992842535788e-06, "objective/entropy": 160.02056884765625, "objective/kl": 20.542434692382812, "objective/non_score_reward": -2.054243326187134, "objective/rlhf_reward": -8.216973185539246, "objective/scores": 0.0, "policy/approxkl_avg": 27.70839500427246, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5086226463317871, "step": 205, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9990867376327515 }, { "episode": 3312, "epoch": 0.05953194089944998, "loss/policy_avg": 0.16328184306621552, "lr": 9.868353783231085e-06, "objective/entropy": -178.42849731445312, "objective/kl": 12.709222793579102, "objective/non_score_reward": -1.270922303199768, "objective/rlhf_reward": -0.6836892724037167, "objective/scores": 1.1, "policy/approxkl_avg": 175.85543823242188, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6555081605911255, "step": 206, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.003648281097412 }, { "episode": 3328, "epoch": 0.059819534816838625, "loss/policy_avg": 0.3191947340965271, "lr": 9.867714723926382e-06, "objective/entropy": 113.40653991699219, "objective/kl": 23.92019271850586, "objective/non_score_reward": -2.392019271850586, "objective/rlhf_reward": -8.011817782130793, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 120.01425170898438, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6426678895950317, "step": 207, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9998692274093628 }, { "episode": 3344, "epoch": 0.06010712873422727, "loss/policy_avg": 0.8106866478919983, "lr": 9.867075664621679e-06, "objective/entropy": -161.32217407226562, "objective/kl": 20.696407318115234, "objective/non_score_reward": -2.069640874862671, "objective/rlhf_reward": -6.899961569396359, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 60.63603973388672, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.424863338470459, "step": 208, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.001417636871338 }, { "episode": 3360, "epoch": 0.06039472265161592, "loss/policy_avg": 0.5404326915740967, "lr": 9.866436605316974e-06, "objective/entropy": 136.03414916992188, "objective/kl": 24.140501022338867, "objective/non_score_reward": -2.414050340652466, "objective/rlhf_reward": -5.256201243400573, "objective/scores": 1.1, "policy/approxkl_avg": 77.51190948486328, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6469956040382385, "step": 209, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9992618560791016 }, { "episode": 3376, "epoch": 0.060682316569004566, "loss/policy_avg": -0.004204496741294861, "lr": 9.86579754601227e-06, "objective/entropy": -205.11416625976562, "objective/kl": 22.115215301513672, "objective/non_score_reward": -2.211521625518799, "objective/rlhf_reward": -7.3954881235078425, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 130.94525146484375, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7604430913925171, "step": 210, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9986162185668945 }, { "episode": 3392, "epoch": 0.06096991048639321, "loss/policy_avg": 0.10069486498832703, "lr": 9.865158486707568e-06, "objective/entropy": 38.40431213378906, "objective/kl": 21.107707977294922, "objective/non_score_reward": -2.1107707023620605, "objective/rlhf_reward": -6.962130549366831, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 48.98419189453125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.43749940395355225, "step": 211, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9993665218353271 }, { "episode": 3408, "epoch": 0.06125750440378186, "loss/policy_avg": 0.013450137339532375, "lr": 9.864519427402863e-06, "objective/entropy": 97.07965087890625, "objective/kl": 26.950225830078125, "objective/non_score_reward": -2.6950225830078125, "objective/rlhf_reward": -8.955262298854898, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 44.33604431152344, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5165296792984009, "step": 212, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9978928565979004 }, { "episode": 3424, "epoch": 0.061545098321170506, "loss/policy_avg": 0.4735873341560364, "lr": 9.86388036809816e-06, "objective/entropy": -1.7870521545410156, "objective/kl": 27.062910079956055, "objective/non_score_reward": -2.7062911987304688, "objective/rlhf_reward": -9.465914988253992, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 72.81141662597656, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6805305480957031, "step": 213, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.998960018157959 }, { "episode": 3440, "epoch": 0.06183269223855915, "loss/policy_avg": 0.09523998200893402, "lr": 9.863241308793457e-06, "objective/entropy": 32.18935012817383, "objective/kl": 9.85006046295166, "objective/non_score_reward": -0.9850060939788818, "objective/rlhf_reward": -2.5161924554901995, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 3.0238242149353027, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.3841710090637207, "step": 214, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9986772537231445 }, { "episode": 3456, "epoch": 0.0621202861559478, "loss/policy_avg": 0.720879316329956, "lr": 9.862602249488753e-06, "objective/entropy": 276.2146301269531, "objective/kl": 28.97698974609375, "objective/non_score_reward": -2.8976993560791016, "objective/rlhf_reward": -9.928937201917755, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 191.53884887695312, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7353510856628418, "step": 215, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9999027252197266 }, { "episode": 3472, "epoch": 0.06240788007333645, "loss/policy_avg": 0.5507330894470215, "lr": 9.86196319018405e-06, "objective/entropy": 250.835693359375, "objective/kl": 29.98652458190918, "objective/non_score_reward": -2.998652219772339, "objective/rlhf_reward": -10.332749491155731, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 85.02761840820312, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6071260571479797, "step": 216, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9968819618225098 }, { "episode": 3488, "epoch": 0.0626954739907251, "loss/policy_avg": 0.9385891556739807, "lr": 9.861324130879346e-06, "objective/entropy": 82.53084564208984, "objective/kl": 26.54790687561035, "objective/non_score_reward": -2.6547906398773193, "objective/rlhf_reward": -8.496456088797125, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 24.03960609436035, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4867916703224182, "step": 217, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9992834329605103 }, { "episode": 3504, "epoch": 0.06298306790811374, "loss/policy_avg": 0.3534790575504303, "lr": 9.860685071574642e-06, "objective/entropy": 230.29193115234375, "objective/kl": 21.73017120361328, "objective/non_score_reward": -2.1730172634124756, "objective/rlhf_reward": -6.867240424427102, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 81.97232055664062, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.558746337890625, "step": 218, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9974663257598877 }, { "episode": 3520, "epoch": 0.0632706618255024, "loss/policy_avg": -0.15977555513381958, "lr": 9.86004601226994e-06, "objective/entropy": 113.71033477783203, "objective/kl": 17.67473030090332, "objective/non_score_reward": -1.7674732208251953, "objective/rlhf_reward": -4.947186651007209, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 94.58512115478516, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5372532606124878, "step": 219, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.001951217651367 }, { "episode": 3536, "epoch": 0.06355825574289103, "loss/policy_avg": 0.9559342265129089, "lr": 9.859406952965236e-06, "objective/entropy": 173.58860778808594, "objective/kl": 33.72608947753906, "objective/non_score_reward": -3.3726086616516113, "objective/rlhf_reward": -11.367727937475713, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 6.491452217102051, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6636508703231812, "step": 220, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9996795654296875 }, { "episode": 3552, "epoch": 0.06384584966027969, "loss/policy_avg": -0.854604184627533, "lr": 9.858767893660533e-06, "objective/entropy": -67.233154296875, "objective/kl": 13.420427322387695, "objective/non_score_reward": -1.3420426845550537, "objective/rlhf_reward": -2.4444515898239345, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 72.4083251953125, "policy/clipfrac_avg": 2.0, "policy/entropy_avg": 0.8086908459663391, "step": 221, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0127267837524414 }, { "episode": 3568, "epoch": 0.06413344357766833, "loss/policy_avg": 0.5410902500152588, "lr": 9.858128834355828e-06, "objective/entropy": 175.63470458984375, "objective/kl": 35.907081604003906, "objective/non_score_reward": -3.5907082557678223, "objective/rlhf_reward": -13.037319932013673, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 99.65482330322266, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6278276443481445, "step": 222, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.999873399734497 }, { "episode": 3584, "epoch": 0.06442103749505698, "loss/policy_avg": 0.3871188163757324, "lr": 9.857489775051125e-06, "objective/entropy": -161.75840759277344, "objective/kl": 18.288314819335938, "objective/non_score_reward": -1.8288315534591675, "objective/rlhf_reward": -5.799554431232151, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 110.67633056640625, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6733036041259766, "step": 223, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9991552829742432 }, { "episode": 3600, "epoch": 0.06470863141244562, "loss/policy_avg": -0.573300838470459, "lr": 9.856850715746422e-06, "objective/entropy": 6.650520324707031, "objective/kl": 26.58426284790039, "objective/non_score_reward": -2.658426284790039, "objective/rlhf_reward": -7.709986720920774, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 44.56170654296875, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.38651180267333984, "step": 224, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0037808418273926 }, { "episode": 3616, "epoch": 0.06499622532983428, "loss/policy_avg": 0.5340808629989624, "lr": 9.856211656441719e-06, "objective/entropy": 59.36520004272461, "objective/kl": 28.841266632080078, "objective/non_score_reward": -2.884126901626587, "objective/rlhf_reward": -9.874647860944854, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 53.19476318359375, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6743514537811279, "step": 225, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9957829713821411 }, { "episode": 3632, "epoch": 0.06528381924722292, "loss/policy_avg": 0.5914766192436218, "lr": 9.855572597137016e-06, "objective/entropy": 228.81517028808594, "objective/kl": 30.393442153930664, "objective/non_score_reward": -3.039344310760498, "objective/rlhf_reward": -10.209965775685246, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 85.6346435546875, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5778177976608276, "step": 226, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9993802309036255 }, { "episode": 3648, "epoch": 0.06557141316461157, "loss/policy_avg": -0.05053609609603882, "lr": 9.854933537832313e-06, "objective/entropy": 13.725364685058594, "objective/kl": 25.695791244506836, "objective/non_score_reward": -2.5695791244506836, "objective/rlhf_reward": -8.330905745701726, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 135.800048828125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.3210442066192627, "step": 227, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9997122287750244 }, { "episode": 3664, "epoch": 0.06585900708200021, "loss/policy_avg": 0.4539129137992859, "lr": 9.854294478527608e-06, "objective/entropy": 93.42439270019531, "objective/kl": 30.396175384521484, "objective/non_score_reward": -3.0396177768707275, "objective/rlhf_reward": -10.779868462172846, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 55.461158752441406, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6131182909011841, "step": 228, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9976425170898438 }, { "episode": 3680, "epoch": 0.06614660099938886, "loss/policy_avg": 0.228049173951149, "lr": 9.853655419222905e-06, "objective/entropy": -28.055843353271484, "objective/kl": 23.269084930419922, "objective/non_score_reward": -2.326908588409424, "objective/rlhf_reward": -6.383914981723997, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 143.5833740234375, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5752028822898865, "step": 229, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.000626802444458 }, { "episode": 3696, "epoch": 0.06643419491677752, "loss/policy_avg": 0.10666107386350632, "lr": 9.853016359918202e-06, "objective/entropy": 74.64518737792969, "objective/kl": 32.4399528503418, "objective/non_score_reward": -3.243995189666748, "objective/rlhf_reward": -11.525382618518218, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 181.31935119628906, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5801441669464111, "step": 230, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9995744228363037 }, { "episode": 3712, "epoch": 0.06672178883416616, "loss/policy_avg": 2.4466023445129395, "lr": 9.852377300613498e-06, "objective/entropy": 244.4732666015625, "objective/kl": 27.413360595703125, "objective/non_score_reward": -2.7413363456726074, "objective/rlhf_reward": -9.14051639583976, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 85.86346435546875, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8622180223464966, "step": 231, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9987142086029053 }, { "episode": 3728, "epoch": 0.06700938275155481, "loss/policy_avg": 0.8113258481025696, "lr": 9.851738241308795e-06, "objective/entropy": 56.00733947753906, "objective/kl": 21.946327209472656, "objective/non_score_reward": -2.1946325302124023, "objective/rlhf_reward": -7.419280850623531, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 29.368534088134766, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.45428696274757385, "step": 232, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9980647563934326 }, { "episode": 3744, "epoch": 0.06729697666894345, "loss/policy_avg": 0.2869613766670227, "lr": 9.85109918200409e-06, "objective/entropy": 128.71649169921875, "objective/kl": 21.821929931640625, "objective/non_score_reward": -2.182192802429199, "objective/rlhf_reward": -7.278173069568023, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 72.65187072753906, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8492765426635742, "step": 233, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9982428550720215 }, { "episode": 3760, "epoch": 0.0675845705863321, "loss/policy_avg": 1.1545510292053223, "lr": 9.850460122699387e-06, "objective/entropy": -46.38230895996094, "objective/kl": 28.68572235107422, "objective/non_score_reward": -2.868572235107422, "objective/rlhf_reward": -10.050457079609004, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 29.78200912475586, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.661322295665741, "step": 234, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9991018772125244 }, { "episode": 3776, "epoch": 0.06787216450372074, "loss/policy_avg": 0.7958990335464478, "lr": 9.849821063394683e-06, "objective/entropy": 157.34841918945312, "objective/kl": 28.915939331054688, "objective/non_score_reward": -2.8915936946868896, "objective/rlhf_reward": -10.240862164527101, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 46.19620895385742, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6968529224395752, "step": 235, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9994633197784424 }, { "episode": 3792, "epoch": 0.0681597584211094, "loss/policy_avg": 0.6319503784179688, "lr": 9.84918200408998e-06, "objective/entropy": 356.89532470703125, "objective/kl": 28.920034408569336, "objective/non_score_reward": -2.8920035362243652, "objective/rlhf_reward": -10.144182522495356, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 15.02867317199707, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.926424503326416, "step": 236, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9992319345474243 }, { "episode": 3808, "epoch": 0.06844735233849804, "loss/policy_avg": 0.29689115285873413, "lr": 9.848542944785276e-06, "objective/entropy": -114.8179931640625, "objective/kl": 22.912490844726562, "objective/non_score_reward": -2.2912492752075195, "objective/rlhf_reward": -7.649225318225559, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 2.519531726837158, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4792546033859253, "step": 237, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9990766048431396 }, { "episode": 3824, "epoch": 0.06873494625588669, "loss/policy_avg": 0.6142581701278687, "lr": 9.847903885480573e-06, "objective/entropy": 42.130271911621094, "objective/kl": 30.74860382080078, "objective/non_score_reward": -3.0748605728149414, "objective/rlhf_reward": -10.920839407531124, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 37.97405242919922, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4455175995826721, "step": 238, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9975383281707764 }, { "episode": 3840, "epoch": 0.06902254017327533, "loss/policy_avg": 0.03958883881568909, "lr": 9.84726482617587e-06, "objective/entropy": 148.7663116455078, "objective/kl": 24.86724853515625, "objective/non_score_reward": -2.486724615097046, "objective/rlhf_reward": -8.56829617270599, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 20.696613311767578, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7744324207305908, "step": 239, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0034701824188232 }, { "episode": 3856, "epoch": 0.06931013409066399, "loss/policy_avg": -0.12924179434776306, "lr": 9.846625766871167e-06, "objective/entropy": 13.191347122192383, "objective/kl": 36.86333465576172, "objective/non_score_reward": -3.686333179473877, "objective/rlhf_reward": -11.821614180446836, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 93.72460174560547, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7017860412597656, "step": 240, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.996403455734253 }, { "episode": 3872, "epoch": 0.06959772800805263, "loss/policy_avg": 0.6671891212463379, "lr": 9.845986707566462e-06, "objective/entropy": 144.81239318847656, "objective/kl": 25.728496551513672, "objective/non_score_reward": -2.572849750518799, "objective/rlhf_reward": -8.168692888990913, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 24.799148559570312, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5449861288070679, "step": 241, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.000767707824707 }, { "episode": 3888, "epoch": 0.06988532192544128, "loss/policy_avg": 1.4478445053100586, "lr": 9.845347648261759e-06, "objective/entropy": -13.714214324951172, "objective/kl": 31.57904052734375, "objective/non_score_reward": -3.1579039096832275, "objective/rlhf_reward": -11.207783777912226, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 7.07413387298584, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5770883560180664, "step": 242, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.997456669807434 }, { "episode": 3904, "epoch": 0.07017291584282992, "loss/policy_avg": -0.1629352867603302, "lr": 9.844708588957056e-06, "objective/entropy": 150.56808471679688, "objective/kl": 22.077739715576172, "objective/non_score_reward": -2.2077741622924805, "objective/rlhf_reward": -8.831096112728119, "objective/scores": 0.0, "policy/approxkl_avg": 6.039865970611572, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.4642740786075592, "step": 243, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.013947010040283 }, { "episode": 3920, "epoch": 0.07046050976021857, "loss/policy_avg": 4.2705912590026855, "lr": 9.844069529652353e-06, "objective/entropy": -73.61671447753906, "objective/kl": 27.2436580657959, "objective/non_score_reward": -2.724365711212158, "objective/rlhf_reward": -9.072634573253701, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 4.7233123779296875, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5502868294715881, "step": 244, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9990230798721313 }, { "episode": 3936, "epoch": 0.07074810367760721, "loss/policy_avg": 0.09502686560153961, "lr": 9.84343047034765e-06, "objective/entropy": 38.153350830078125, "objective/kl": 25.953601837158203, "objective/non_score_reward": -2.595360040664673, "objective/rlhf_reward": -8.434029231743748, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 183.2377471923828, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8148726224899292, "step": 245, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9989858865737915 }, { "episode": 3952, "epoch": 0.07103569759499587, "loss/policy_avg": 0.36105144023895264, "lr": 9.842791411042945e-06, "objective/entropy": 46.69014358520508, "objective/kl": 24.270606994628906, "objective/non_score_reward": -2.427060842514038, "objective/rlhf_reward": -8.329641216484408, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 23.915287017822266, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4709934592247009, "step": 246, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.001595973968506 }, { "episode": 3968, "epoch": 0.0713232915123845, "loss/policy_avg": 0.3951423168182373, "lr": 9.842152351738242e-06, "objective/entropy": 0.16453170776367188, "objective/kl": 27.542736053466797, "objective/non_score_reward": -2.7542738914489746, "objective/rlhf_reward": -11.017095446586609, "objective/scores": 0.0, "policy/approxkl_avg": 11.038375854492188, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6403580904006958, "step": 247, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9978313446044922 }, { "episode": 3984, "epoch": 0.07161088542977316, "loss/policy_avg": 0.2933734655380249, "lr": 9.841513292433539e-06, "objective/entropy": -41.10125732421875, "objective/kl": 25.373741149902344, "objective/non_score_reward": -2.5373740196228027, "objective/rlhf_reward": -8.770894267646176, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 14.429267883300781, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6396682262420654, "step": 248, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0005202293395996 }, { "episode": 4000, "epoch": 0.07189847934716181, "loss/policy_avg": 0.24670132994651794, "lr": 9.840874233128836e-06, "objective/entropy": -35.8713264465332, "objective/kl": 30.457420349121094, "objective/non_score_reward": -3.0457420349121094, "objective/rlhf_reward": -10.449634091059366, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 79.78580474853516, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.41042375564575195, "step": 249, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9961113929748535 }, { "episode": 4016, "epoch": 0.07218607326455045, "loss/policy_avg": 0.017466381192207336, "lr": 9.840235173824132e-06, "objective/entropy": 87.24893188476562, "objective/kl": 17.873748779296875, "objective/non_score_reward": -1.7873749732971191, "objective/rlhf_reward": -5.026793541685615, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 83.72406005859375, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7392317056655884, "step": 250, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9985733032226562 }, { "episode": 4032, "epoch": 0.07247366718193911, "loss/policy_avg": 0.209593266248703, "lr": 9.83959611451943e-06, "objective/entropy": -10.21453857421875, "objective/kl": 26.26023292541504, "objective/non_score_reward": -2.626023292541504, "objective/rlhf_reward": -9.104092931747438, "objective/scores": 0.35, "policy/approxkl_avg": 2.64996337890625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5523943901062012, "step": 251, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9995850324630737 }, { "episode": 4048, "epoch": 0.07276126109932775, "loss/policy_avg": 0.5933290719985962, "lr": 9.838957055214724e-06, "objective/entropy": -18.139259338378906, "objective/kl": 29.199474334716797, "objective/non_score_reward": -2.9199471473693848, "objective/rlhf_reward": -10.27978894710541, "objective/scores": 0.35, "policy/approxkl_avg": 50.652503967285156, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5432610511779785, "step": 252, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9982495307922363 }, { "episode": 4064, "epoch": 0.0730488550167164, "loss/policy_avg": 1.320284366607666, "lr": 9.838317995910021e-06, "objective/entropy": -10.506271362304688, "objective/kl": 28.47583770751953, "objective/non_score_reward": -2.847583532333374, "objective/rlhf_reward": -9.44292337723249, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 83.18882751464844, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5443971157073975, "step": 253, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.997657299041748 }, { "episode": 4080, "epoch": 0.07333644893410504, "loss/policy_avg": -0.02555149793624878, "lr": 9.837678936605318e-06, "objective/entropy": -81.56509399414062, "objective/kl": 15.26602840423584, "objective/non_score_reward": -1.526602864265442, "objective/rlhf_reward": -4.281582649025034, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 6.358033657073975, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6404141187667847, "step": 254, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0027146339416504 }, { "episode": 4096, "epoch": 0.0736240428514937, "loss/policy_avg": 0.4154921770095825, "lr": 9.837039877300615e-06, "objective/entropy": -86.56658935546875, "objective/kl": 15.54503059387207, "objective/non_score_reward": -1.5545029640197754, "objective/rlhf_reward": -4.393183226856302, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 36.390655517578125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.9074146747589111, "step": 255, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9989664554595947 }, { "episode": 4112, "epoch": 0.07391163676888234, "loss/policy_avg": -0.2038569152355194, "lr": 9.83640081799591e-06, "objective/entropy": -80.65778350830078, "objective/kl": 20.036571502685547, "objective/non_score_reward": -2.003657102584839, "objective/rlhf_reward": -6.189799661907266, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 28.666210174560547, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7191000580787659, "step": 256, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.999082088470459 }, { "episode": 4128, "epoch": 0.07419923068627099, "loss/policy_avg": 0.5487632751464844, "lr": 9.835761758691207e-06, "objective/entropy": 64.21192932128906, "objective/kl": 25.55659294128418, "objective/non_score_reward": -2.555659294128418, "objective/rlhf_reward": -8.79880495806512, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 74.83338928222656, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7529090046882629, "step": 257, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9991776943206787 }, { "episode": 4144, "epoch": 0.07448682460365963, "loss/policy_avg": 0.8301103115081787, "lr": 9.835122699386504e-06, "objective/entropy": 152.20065307617188, "objective/kl": 26.725215911865234, "objective/non_score_reward": -2.6725215911865234, "objective/rlhf_reward": -10.690086603164673, "objective/scores": 0.0, "policy/approxkl_avg": 86.60305786132812, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.3636325001716614, "step": 258, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.996506929397583 }, { "episode": 4160, "epoch": 0.07477441852104828, "loss/policy_avg": 0.6052212119102478, "lr": 9.8344836400818e-06, "objective/entropy": 92.0700454711914, "objective/kl": 20.43947982788086, "objective/non_score_reward": -2.043948173522949, "objective/rlhf_reward": -3.775792723894119, "objective/scores": 1.1, "policy/approxkl_avg": 6.338429927825928, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4831230342388153, "step": 259, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9987390041351318 }, { "episode": 4176, "epoch": 0.07506201243843692, "loss/policy_avg": 0.33531126379966736, "lr": 9.833844580777096e-06, "objective/entropy": 103.8875732421875, "objective/kl": 41.16206741333008, "objective/non_score_reward": -4.116207122802734, "objective/rlhf_reward": -14.860707316462118, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 157.35191345214844, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5286747217178345, "step": 260, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9989887475967407 }, { "episode": 4192, "epoch": 0.07534960635582558, "loss/policy_avg": 0.8983044624328613, "lr": 9.833205521472393e-06, "objective/entropy": -19.21771812438965, "objective/kl": 27.187969207763672, "objective/non_score_reward": -2.718796968460083, "objective/rlhf_reward": -8.927777002530036, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 116.0262451171875, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5258731842041016, "step": 261, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9985415935516357 }, { "episode": 4208, "epoch": 0.07563720027321422, "loss/policy_avg": 0.3744966983795166, "lr": 9.83256646216769e-06, "objective/entropy": 108.31391906738281, "objective/kl": 27.059907913208008, "objective/non_score_reward": -2.705990791320801, "objective/rlhf_reward": -9.090629831949869, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 132.42181396484375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7807722687721252, "step": 262, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9999628067016602 }, { "episode": 4224, "epoch": 0.07592479419060287, "loss/policy_avg": -0.06834838539361954, "lr": 9.831927402862987e-06, "objective/entropy": -89.212890625, "objective/kl": 21.477336883544922, "objective/non_score_reward": -2.147733688354492, "objective/rlhf_reward": -4.190934514999389, "objective/scores": 1.1, "policy/approxkl_avg": 2.770085573196411, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6721138954162598, "step": 263, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0005064010620117 }, { "episode": 4240, "epoch": 0.07621238810799151, "loss/policy_avg": 0.20960178971290588, "lr": 9.831288343558284e-06, "objective/entropy": 7.579254150390625, "objective/kl": 31.429780960083008, "objective/non_score_reward": -3.1429781913757324, "objective/rlhf_reward": -11.212662303183954, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 39.11629104614258, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7809767723083496, "step": 264, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.996565580368042 }, { "episode": 4256, "epoch": 0.07649998202538016, "loss/policy_avg": 0.37524640560150146, "lr": 9.830649284253579e-06, "objective/entropy": 211.3717498779297, "objective/kl": 22.981361389160156, "objective/non_score_reward": -2.2981362342834473, "objective/rlhf_reward": -6.792545056343078, "objective/scores": 0.6, "policy/approxkl_avg": 6.7515716552734375, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7717372179031372, "step": 265, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9991950988769531 }, { "episode": 4272, "epoch": 0.0767875759427688, "loss/policy_avg": 1.0095475912094116, "lr": 9.830010224948876e-06, "objective/entropy": -20.248001098632812, "objective/kl": 24.134700775146484, "objective/non_score_reward": -2.4134700298309326, "objective/rlhf_reward": -7.920546785990396, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 72.81602478027344, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.48144859075546265, "step": 266, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9972069263458252 }, { "episode": 4288, "epoch": 0.07707516986015746, "loss/policy_avg": 0.14088629186153412, "lr": 9.829371165644173e-06, "objective/entropy": 199.36297607421875, "objective/kl": 21.469898223876953, "objective/non_score_reward": -2.1469898223876953, "objective/rlhf_reward": -7.031699865069941, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 9.675331115722656, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.8196889162063599, "step": 267, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.000971555709839 }, { "episode": 4304, "epoch": 0.0773627637775461, "loss/policy_avg": 0.7135397791862488, "lr": 9.82873210633947e-06, "objective/entropy": 132.78390502929688, "objective/kl": 29.841154098510742, "objective/non_score_reward": -2.9841156005859375, "objective/rlhf_reward": -10.485864262194976, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 51.49626159667969, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6590239405632019, "step": 268, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9994990825653076 }, { "episode": 4320, "epoch": 0.07765035769493475, "loss/policy_avg": 0.6342403888702393, "lr": 9.828093047034766e-06, "objective/entropy": 68.02133178710938, "objective/kl": 25.947755813598633, "objective/non_score_reward": -2.594775676727295, "objective/rlhf_reward": -9.000500419226986, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 14.322699546813965, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7533285617828369, "step": 269, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9998431205749512 }, { "episode": 4336, "epoch": 0.0779379516123234, "loss/policy_avg": 1.3432139158248901, "lr": 9.827453987730061e-06, "objective/entropy": -63.51703643798828, "objective/kl": 25.882217407226562, "objective/non_score_reward": -2.588221788406372, "objective/rlhf_reward": -8.230180921331915, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 106.42034912109375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.3954962491989136, "step": 270, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9984210729599 }, { "episode": 4352, "epoch": 0.07822554552971205, "loss/policy_avg": 0.9003316760063171, "lr": 9.826814928425358e-06, "objective/entropy": 303.42669677734375, "objective/kl": 33.25891876220703, "objective/non_score_reward": -3.325892448425293, "objective/rlhf_reward": -11.822616937573315, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 65.77352905273438, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7454761266708374, "step": 271, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9990100860595703 }, { "episode": 4368, "epoch": 0.0785131394471007, "loss/policy_avg": 1.1572515964508057, "lr": 9.826175869120655e-06, "objective/entropy": -59.230491638183594, "objective/kl": 25.21849250793457, "objective/non_score_reward": -2.5218493938446045, "objective/rlhf_reward": -10.087397575378418, "objective/scores": 0.0, "policy/approxkl_avg": 142.75778198242188, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5425001382827759, "step": 272, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.998133897781372 }, { "episode": 4384, "epoch": 0.07880073336448934, "loss/policy_avg": 0.17176832258701324, "lr": 9.825536809815952e-06, "objective/entropy": 213.77191162109375, "objective/kl": 31.61981773376465, "objective/non_score_reward": -3.1619815826416016, "objective/rlhf_reward": -10.7005155784654, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 128.8477783203125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5492858290672302, "step": 273, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9998712539672852 }, { "episode": 4400, "epoch": 0.079088327281878, "loss/policy_avg": -0.22184377908706665, "lr": 9.824897750511249e-06, "objective/entropy": 161.00198364257812, "objective/kl": 34.806671142578125, "objective/non_score_reward": -3.4806675910949707, "objective/rlhf_reward": -11.799963655249151, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 48.8912239074707, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4313841462135315, "step": 274, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.0014257431030273 }, { "episode": 4416, "epoch": 0.07937592119926663, "loss/policy_avg": 0.590415358543396, "lr": 9.824258691206546e-06, "objective/entropy": -94.14356231689453, "objective/kl": 28.92959976196289, "objective/non_score_reward": -2.8929600715637207, "objective/rlhf_reward": -9.90998030227481, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 1.575645923614502, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.449258029460907, "step": 275, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9994723796844482 }, { "episode": 4432, "epoch": 0.07966351511665529, "loss/policy_avg": 0.2740442454814911, "lr": 9.823619631901841e-06, "objective/entropy": 56.66014099121094, "objective/kl": 24.139942169189453, "objective/non_score_reward": -2.413994073867798, "objective/rlhf_reward": -7.533270301596199, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 41.256080627441406, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6261377334594727, "step": 276, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9990897178649902 }, { "episode": 4448, "epoch": 0.07995110903404393, "loss/policy_avg": 0.026854295283555984, "lr": 9.822980572597138e-06, "objective/entropy": 135.07037353515625, "objective/kl": 30.443017959594727, "objective/non_score_reward": -3.044301748275757, "objective/rlhf_reward": -12.177206993103027, "objective/scores": 0.0, "policy/approxkl_avg": 14.024923324584961, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5266727209091187, "step": 277, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9980167150497437 }, { "episode": 4464, "epoch": 0.08023870295143258, "loss/policy_avg": 0.0908375084400177, "lr": 9.822341513292433e-06, "objective/entropy": 98.10940551757812, "objective/kl": 26.351314544677734, "objective/non_score_reward": -2.635131359100342, "objective/rlhf_reward": -9.059573057110667, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 61.92028045654297, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5201822519302368, "step": 278, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9965415000915527 }, { "episode": 4480, "epoch": 0.08052629686882122, "loss/policy_avg": 0.3492497205734253, "lr": 9.82170245398773e-06, "objective/entropy": 79.57078552246094, "objective/kl": 28.74835205078125, "objective/non_score_reward": -2.8748350143432617, "objective/rlhf_reward": -7.099340653419494, "objective/scores": 1.1, "policy/approxkl_avg": 45.850738525390625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7365690469741821, "step": 279, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9999916553497314 }, { "episode": 4496, "epoch": 0.08081389078620987, "loss/policy_avg": 0.6324511170387268, "lr": 9.821063394683027e-06, "objective/entropy": 116.90592956542969, "objective/kl": 33.273155212402344, "objective/non_score_reward": -3.3273158073425293, "objective/rlhf_reward": -8.90926299095154, "objective/scores": 1.1, "policy/approxkl_avg": 50.5905647277832, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5080363750457764, "step": 280, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9990668296813965 }, { "episode": 4512, "epoch": 0.08110148470359851, "loss/policy_avg": -0.1385992020368576, "lr": 9.820424335378324e-06, "objective/entropy": 72.11842346191406, "objective/kl": 33.207122802734375, "objective/non_score_reward": -3.320712089538574, "objective/rlhf_reward": -13.282849073410034, "objective/scores": 0.0, "policy/approxkl_avg": 60.59511184692383, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8196091651916504, "step": 281, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.998537302017212 }, { "episode": 4528, "epoch": 0.08138907862098717, "loss/policy_avg": -0.2620585262775421, "lr": 9.81978527607362e-06, "objective/entropy": -5.884607315063477, "objective/kl": 39.53453063964844, "objective/non_score_reward": -3.9534530639648438, "objective/rlhf_reward": -13.691106977240118, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 83.97123718261719, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.4607764780521393, "step": 282, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.000446319580078 }, { "episode": 4544, "epoch": 0.08167667253837581, "loss/policy_avg": 0.8184198141098022, "lr": 9.819146216768916e-06, "objective/entropy": -124.17362976074219, "objective/kl": 30.42546844482422, "objective/non_score_reward": -3.0425467491149902, "objective/rlhf_reward": -10.566067729059775, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 20.279199600219727, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.602076530456543, "step": 283, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9996403455734253 }, { "episode": 4560, "epoch": 0.08196426645576446, "loss/policy_avg": 0.1789843738079071, "lr": 9.818507157464213e-06, "objective/entropy": 173.48333740234375, "objective/kl": 23.40087890625, "objective/non_score_reward": -2.340087890625, "objective/rlhf_reward": -7.981749632445675, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 17.03640365600586, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6565302014350891, "step": 284, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.004666566848755 }, { "episode": 4576, "epoch": 0.0822518603731531, "loss/policy_avg": 1.0035152435302734, "lr": 9.81786809815951e-06, "objective/entropy": 18.757537841796875, "objective/kl": 24.085613250732422, "objective/non_score_reward": -2.4085617065429688, "objective/rlhf_reward": -5.2342465877532955, "objective/scores": 1.1, "policy/approxkl_avg": 54.95973587036133, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5847882032394409, "step": 285, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9993207454681396 }, { "episode": 4592, "epoch": 0.08253945429054176, "loss/policy_avg": 5.199029922485352, "lr": 9.817229038854806e-06, "objective/entropy": -160.87271118164062, "objective/kl": 20.840656280517578, "objective/non_score_reward": -2.0840654373168945, "objective/rlhf_reward": -5.936261987686157, "objective/scores": 0.6, "policy/approxkl_avg": 9.209554672241211, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.6878505945205688, "step": 286, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9999942779541016 }, { "episode": 4608, "epoch": 0.0828270482079304, "loss/policy_avg": 1.134081244468689, "lr": 9.816589979550103e-06, "objective/entropy": 120.20220947265625, "objective/kl": 32.1230583190918, "objective/non_score_reward": -3.212306499481201, "objective/rlhf_reward": -11.449225521087648, "objective/scores": 0.35, "policy/approxkl_avg": 37.81696319580078, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8285540342330933, "step": 287, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9991226196289062 }, { "episode": 4624, "epoch": 0.08311464212531905, "loss/policy_avg": 0.17092914879322052, "lr": 9.8159509202454e-06, "objective/entropy": 6.329719543457031, "objective/kl": 29.584348678588867, "objective/non_score_reward": -2.95843505859375, "objective/rlhf_reward": -10.171880011976349, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 29.629112243652344, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5302486419677734, "step": 288, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0020604133605957 }, { "episode": 4640, "epoch": 0.0834022360427077, "loss/policy_avg": 0.17788568139076233, "lr": 9.815311860940695e-06, "objective/entropy": 21.96484375, "objective/kl": 28.446231842041016, "objective/non_score_reward": -2.84462308883667, "objective/rlhf_reward": -9.431081603245671, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 137.49514770507812, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6393083333969116, "step": 289, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9981507062911987 }, { "episode": 4656, "epoch": 0.08368982996009634, "loss/policy_avg": 0.4766189754009247, "lr": 9.814672801635992e-06, "objective/entropy": 87.13041687011719, "objective/kl": 26.18436050415039, "objective/non_score_reward": -2.618436098098755, "objective/rlhf_reward": -9.095142700759274, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 64.16291809082031, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4916858971118927, "step": 290, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9992941617965698 }, { "episode": 4672, "epoch": 0.083977423877485, "loss/policy_avg": 7.575510025024414, "lr": 9.81403374233129e-06, "objective/entropy": -187.93580627441406, "objective/kl": 21.01421356201172, "objective/non_score_reward": -2.101421356201172, "objective/rlhf_reward": -6.9550868078187555, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 58.152530670166016, "policy/clipfrac_avg": 0.25, "policy/entropy_avg": 0.793678343296051, "step": 291, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9997904300689697 }, { "episode": 4688, "epoch": 0.08426501779487364, "loss/policy_avg": 0.3069241940975189, "lr": 9.813394683026586e-06, "objective/entropy": 95.74089050292969, "objective/kl": 22.938138961791992, "objective/non_score_reward": -2.293813943862915, "objective/rlhf_reward": -7.052549543158088, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 9.722650527954102, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5521177053451538, "step": 292, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9989383220672607 }, { "episode": 4704, "epoch": 0.08455261171226229, "loss/policy_avg": 0.8028863072395325, "lr": 9.812755623721883e-06, "objective/entropy": 225.46250915527344, "objective/kl": 32.304569244384766, "objective/non_score_reward": -3.230457305908203, "objective/rlhf_reward": -11.18849541346232, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 103.39628601074219, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5377808809280396, "step": 293, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9996423721313477 }, { "episode": 4720, "epoch": 0.08484020562965093, "loss/policy_avg": 0.5835884809494019, "lr": 9.81211656441718e-06, "objective/entropy": 75.27652740478516, "objective/kl": 30.011789321899414, "objective/non_score_reward": -3.0011792182922363, "objective/rlhf_reward": -9.882009925619636, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 10.76335334777832, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7725957632064819, "step": 294, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9982374906539917 }, { "episode": 4736, "epoch": 0.08512779954703958, "loss/policy_avg": 0.17510247230529785, "lr": 9.811477505112475e-06, "objective/entropy": 153.28558349609375, "objective/kl": 35.96855926513672, "objective/non_score_reward": -3.596856117248535, "objective/rlhf_reward": -12.906472566540598, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 19.366321563720703, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6706559658050537, "step": 295, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9993175268173218 }, { "episode": 4752, "epoch": 0.08541539346442822, "loss/policy_avg": 0.4794872999191284, "lr": 9.810838445807772e-06, "objective/entropy": 254.9187469482422, "objective/kl": 34.023677825927734, "objective/non_score_reward": -3.4023680686950684, "objective/rlhf_reward": -12.158873777003631, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 62.12803268432617, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6503519415855408, "step": 296, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9984500408172607 }, { "episode": 4768, "epoch": 0.08570298738181688, "loss/policy_avg": 1.1904816627502441, "lr": 9.810199386503069e-06, "objective/entropy": 146.021484375, "objective/kl": 35.92856216430664, "objective/non_score_reward": -3.5928561687469482, "objective/rlhf_reward": -12.424013684468207, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 37.72700500488281, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6158914566040039, "step": 297, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9981852769851685 }, { "episode": 4784, "epoch": 0.08599058129920552, "loss/policy_avg": 0.000278279185295105, "lr": 9.809560327198366e-06, "objective/entropy": 178.57492065429688, "objective/kl": 34.800636291503906, "objective/non_score_reward": -3.4800639152526855, "objective/rlhf_reward": -10.99653712356207, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 41.639854431152344, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7849889993667603, "step": 298, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9984562397003174 }, { "episode": 4800, "epoch": 0.08627817521659417, "loss/policy_avg": 0.7629772424697876, "lr": 9.808921267893663e-06, "objective/entropy": -145.59861755371094, "objective/kl": 28.413082122802734, "objective/non_score_reward": -2.841308116912842, "objective/rlhf_reward": -6.965232110023498, "objective/scores": 1.1, "policy/approxkl_avg": 13.004857063293457, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.684173047542572, "step": 299, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9994359016418457 }, { "episode": 4816, "epoch": 0.08656576913398281, "loss/policy_avg": 1.7354516983032227, "lr": 9.808282208588958e-06, "objective/entropy": 272.84912109375, "objective/kl": 26.817108154296875, "objective/non_score_reward": -2.681710720062256, "objective/rlhf_reward": -9.211070620807346, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 35.25104904174805, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8183693885803223, "step": 300, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9998483657836914 }, { "episode": 4832, "epoch": 0.08685336305137147, "loss/policy_avg": 0.06534934043884277, "lr": 9.807643149284255e-06, "objective/entropy": 152.22633361816406, "objective/kl": 30.80361557006836, "objective/non_score_reward": -3.0803616046905518, "objective/rlhf_reward": -12.321446180343628, "objective/scores": 0.0, "policy/approxkl_avg": 187.40298461914062, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.619062066078186, "step": 301, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9991655349731445 }, { "episode": 4848, "epoch": 0.0871409569687601, "loss/policy_avg": 1.8463071584701538, "lr": 9.80700408997955e-06, "objective/entropy": -59.8196907043457, "objective/kl": 31.326427459716797, "objective/non_score_reward": -3.132642984390259, "objective/rlhf_reward": -11.014799916537937, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 56.62882995605469, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6038594245910645, "step": 302, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9993600845336914 }, { "episode": 4864, "epoch": 0.08742855088614876, "loss/policy_avg": 0.08039037883281708, "lr": 9.806365030674847e-06, "objective/entropy": 40.064144134521484, "objective/kl": 22.286996841430664, "objective/non_score_reward": -2.2286999225616455, "objective/rlhf_reward": -7.514799362421035, "objective/scores": 0.35, "policy/approxkl_avg": 38.59841537475586, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.49097996950149536, "step": 303, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9966613054275513 }, { "episode": 4880, "epoch": 0.0877161448035374, "loss/policy_avg": 0.01872839219868183, "lr": 9.805725971370144e-06, "objective/entropy": 58.7380256652832, "objective/kl": 28.672008514404297, "objective/non_score_reward": -2.8672008514404297, "objective/rlhf_reward": -9.735470251242319, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 149.07861328125, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6022211313247681, "step": 304, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0004193782806396 }, { "episode": 4896, "epoch": 0.08800373872092605, "loss/policy_avg": 0.5821743011474609, "lr": 9.80508691206544e-06, "objective/entropy": -12.124443054199219, "objective/kl": 24.10376739501953, "objective/non_score_reward": -2.410377025604248, "objective/rlhf_reward": -5.241507506370544, "objective/scores": 1.1, "policy/approxkl_avg": 3.3420569896698, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6286916732788086, "step": 305, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.002763509750366 }, { "episode": 4912, "epoch": 0.08829133263831469, "loss/policy_avg": 0.32468903064727783, "lr": 9.804447852760737e-06, "objective/entropy": -245.09518432617188, "objective/kl": 25.548696517944336, "objective/non_score_reward": -2.5548696517944336, "objective/rlhf_reward": -10.219478368759155, "objective/scores": 0.0, "policy/approxkl_avg": 18.726303100585938, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.633787989616394, "step": 306, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0003480911254883 }, { "episode": 4928, "epoch": 0.08857892655570335, "loss/policy_avg": 0.5798380970954895, "lr": 9.803808793456034e-06, "objective/entropy": 91.35831451416016, "objective/kl": 35.70774459838867, "objective/non_score_reward": -3.570774555206299, "objective/rlhf_reward": -14.283098220825195, "objective/scores": 0.0, "policy/approxkl_avg": 44.0499267578125, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4979282021522522, "step": 307, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0003609657287598 }, { "episode": 4944, "epoch": 0.088866520473092, "loss/policy_avg": 0.36592239141464233, "lr": 9.80316973415133e-06, "objective/entropy": 39.27040100097656, "objective/kl": 30.252880096435547, "objective/non_score_reward": -3.025287628173828, "objective/rlhf_reward": -10.775638136893434, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 3.1499075889587402, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6890300512313843, "step": 308, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9988046884536743 }, { "episode": 4960, "epoch": 0.08915411439048064, "loss/policy_avg": 0.08172816783189774, "lr": 9.802530674846626e-06, "objective/entropy": -196.7550811767578, "objective/kl": 30.32009506225586, "objective/non_score_reward": -3.0320096015930176, "objective/rlhf_reward": -9.204319153667662, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 8.101791381835938, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.5759010910987854, "step": 309, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0000977516174316 }, { "episode": 4976, "epoch": 0.0894417083078693, "loss/policy_avg": 0.5907818078994751, "lr": 9.801891615541923e-06, "objective/entropy": -3.5698318481445312, "objective/kl": 28.213176727294922, "objective/non_score_reward": -2.8213181495666504, "objective/rlhf_reward": -9.72901317378576, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 56.35433578491211, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6199610233306885, "step": 310, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9979722499847412 }, { "episode": 4992, "epoch": 0.08972930222525793, "loss/policy_avg": 0.39707911014556885, "lr": 9.80125255623722e-06, "objective/entropy": -11.338485717773438, "objective/kl": 24.322521209716797, "objective/non_score_reward": -2.4322521686553955, "objective/rlhf_reward": -8.350406386939389, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 14.5820951461792, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6451054811477661, "step": 311, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0019586086273193 }, { "episode": 5008, "epoch": 0.09001689614264659, "loss/policy_avg": -0.07866669446229935, "lr": 9.800613496932517e-06, "objective/entropy": 170.05404663085938, "objective/kl": 28.295799255371094, "objective/non_score_reward": -2.8295798301696777, "objective/rlhf_reward": -9.894486983020869, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 40.782066345214844, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6188048124313354, "step": 312, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0017778873443604 }, { "episode": 5024, "epoch": 0.09030449006003523, "loss/policy_avg": -0.23688295483589172, "lr": 9.799974437627812e-06, "objective/entropy": 156.63333129882812, "objective/kl": 27.922500610351562, "objective/non_score_reward": -2.792250156402588, "objective/rlhf_reward": -9.718402723880157, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 22.294483184814453, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.4276520609855652, "step": 313, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0026590824127197 }, { "episode": 5040, "epoch": 0.09059208397742388, "loss/policy_avg": 0.09796786308288574, "lr": 9.799335378323109e-06, "objective/entropy": -10.673637390136719, "objective/kl": 20.40918731689453, "objective/non_score_reward": -2.0409185886383057, "objective/rlhf_reward": -8.163674473762512, "objective/scores": 0.0, "policy/approxkl_avg": 8.275084495544434, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5387430787086487, "step": 314, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9982633590698242 }, { "episode": 5056, "epoch": 0.09087967789481252, "loss/policy_avg": 0.17557716369628906, "lr": 9.798696319018406e-06, "objective/entropy": 20.533397674560547, "objective/kl": 33.14729309082031, "objective/non_score_reward": -3.3147292137145996, "objective/rlhf_reward": -11.702658264842583, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 58.23655700683594, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6181402206420898, "step": 315, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.998816728591919 }, { "episode": 5072, "epoch": 0.09116727181220118, "loss/policy_avg": 0.28663304448127747, "lr": 9.798057259713703e-06, "objective/entropy": 110.77783203125, "objective/kl": 24.706939697265625, "objective/non_score_reward": -2.470694065093994, "objective/rlhf_reward": -8.278656039301472, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 21.429655075073242, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6165672540664673, "step": 316, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0001864433288574 }, { "episode": 5088, "epoch": 0.09145486572958982, "loss/policy_avg": 0.0841158926486969, "lr": 9.797418200409e-06, "objective/entropy": 64.50070190429688, "objective/kl": 32.75787353515625, "objective/non_score_reward": -3.275787353515625, "objective/rlhf_reward": -11.499028954569418, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 102.43559265136719, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6068868637084961, "step": 317, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9997694492340088 }, { "episode": 5104, "epoch": 0.09174245964697847, "loss/policy_avg": 0.35147473216056824, "lr": 9.796779141104296e-06, "objective/entropy": 208.5213623046875, "objective/kl": 31.126712799072266, "objective/non_score_reward": -3.112671375274658, "objective/rlhf_reward": -11.10904937079492, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 146.6444091796875, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7985448837280273, "step": 318, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9988431930541992 }, { "episode": 5120, "epoch": 0.09203005356436711, "loss/policy_avg": 0.19098839163780212, "lr": 9.796140081799592e-06, "objective/entropy": -30.1602783203125, "objective/kl": 31.919559478759766, "objective/non_score_reward": -3.191955804824829, "objective/rlhf_reward": -11.36782262325287, "objective/scores": 0.35, "policy/approxkl_avg": 31.554279327392578, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7731765508651733, "step": 319, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0018060207366943 }, { "episode": 5136, "epoch": 0.09231764748175576, "loss/policy_avg": 0.05387501046061516, "lr": 9.795501022494888e-06, "objective/entropy": 109.4754638671875, "objective/kl": 32.21202850341797, "objective/non_score_reward": -3.2212026119232178, "objective/rlhf_reward": -11.434212188334808, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 7.5359039306640625, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5108368396759033, "step": 320, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9981523752212524 }, { "episode": 5152, "epoch": 0.0926052413991444, "loss/policy_avg": 0.5724260210990906, "lr": 9.794861963190185e-06, "objective/entropy": 62.85846710205078, "objective/kl": 30.164125442504883, "objective/non_score_reward": -3.0164127349853516, "objective/rlhf_reward": -10.332317308584848, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 44.99430465698242, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7216867208480835, "step": 321, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.000819683074951 }, { "episode": 5168, "epoch": 0.09289283531653306, "loss/policy_avg": 0.23510941863059998, "lr": 9.794222903885482e-06, "objective/entropy": 88.79434204101562, "objective/kl": 33.60057830810547, "objective/non_score_reward": -3.360057830810547, "objective/rlhf_reward": -11.04023096561432, "objective/scores": 0.6, "policy/approxkl_avg": 60.40937805175781, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6445315480232239, "step": 322, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.001824378967285 }, { "episode": 5184, "epoch": 0.0931804292339217, "loss/policy_avg": 0.10644792020320892, "lr": 9.793583844580777e-06, "objective/entropy": 73.26347351074219, "objective/kl": 32.69441223144531, "objective/non_score_reward": -3.2694411277770996, "objective/rlhf_reward": -11.521504967418268, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 11.021139144897461, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5057616829872131, "step": 323, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9983140230178833 }, { "episode": 5200, "epoch": 0.09346802315131035, "loss/policy_avg": 0.7994442582130432, "lr": 9.792944785276074e-06, "objective/entropy": 88.5349349975586, "objective/kl": 25.706418991088867, "objective/non_score_reward": -2.5706419944763184, "objective/rlhf_reward": -8.62070823234378, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 55.555015563964844, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.554456889629364, "step": 324, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9997270107269287 }, { "episode": 5216, "epoch": 0.09375561706869899, "loss/policy_avg": 0.5393191576004028, "lr": 9.792305725971371e-06, "objective/entropy": 74.77957153320312, "objective/kl": 36.75124740600586, "objective/non_score_reward": -3.6751246452331543, "objective/rlhf_reward": -12.967165247599283, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 22.211036682128906, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.47837570309638977, "step": 325, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0005834102630615 }, { "episode": 5232, "epoch": 0.09404321098608764, "loss/policy_avg": 0.5926495790481567, "lr": 9.791666666666666e-06, "objective/entropy": 94.69478607177734, "objective/kl": 32.18170166015625, "objective/non_score_reward": -3.218170166015625, "objective/rlhf_reward": -11.531045725851683, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 12.2184419631958, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.9364026188850403, "step": 326, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9986941814422607 }, { "episode": 5248, "epoch": 0.0943308049034763, "loss/policy_avg": 8.741055488586426, "lr": 9.791027607361963e-06, "objective/entropy": 13.209190368652344, "objective/kl": 46.40322494506836, "objective/non_score_reward": -4.640322208404541, "objective/rlhf_reward": -17.21965341857019, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 196.84405517578125, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6481121182441711, "step": 327, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9997447729110718 }, { "episode": 5264, "epoch": 0.09461839882086494, "loss/policy_avg": -0.0158542487770319, "lr": 9.79038854805726e-06, "objective/entropy": -67.68810272216797, "objective/kl": 25.325042724609375, "objective/non_score_reward": -2.5325045585632324, "objective/rlhf_reward": -8.573758213725641, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 14.798250198364258, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7733464241027832, "step": 328, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.998700499534607 }, { "episode": 5280, "epoch": 0.09490599273825359, "loss/policy_avg": 0.06980250030755997, "lr": 9.789749488752557e-06, "objective/entropy": 66.16055297851562, "objective/kl": 28.001384735107422, "objective/non_score_reward": -2.8001387119293213, "objective/rlhf_reward": -9.841305100654049, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 26.662395477294922, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4329299330711365, "step": 329, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9985547065734863 }, { "episode": 5296, "epoch": 0.09519358665564223, "loss/policy_avg": 1.1175042390823364, "lr": 9.789110429447854e-06, "objective/entropy": 198.39385986328125, "objective/kl": 35.409645080566406, "objective/non_score_reward": -3.5409646034240723, "objective/rlhf_reward": -12.041152181402715, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 80.42436218261719, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4869406819343567, "step": 330, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.000159502029419 }, { "episode": 5312, "epoch": 0.09548118057303089, "loss/policy_avg": 0.2751445472240448, "lr": 9.78847137014315e-06, "objective/entropy": 171.96897888183594, "objective/kl": 39.34714889526367, "objective/non_score_reward": -3.9347147941589355, "objective/rlhf_reward": -13.914030189785073, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 72.23497009277344, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.49671417474746704, "step": 331, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9995276927947998 }, { "episode": 5328, "epoch": 0.09576877449041953, "loss/policy_avg": 0.7539587616920471, "lr": 9.787832310838446e-06, "objective/entropy": 8.914024353027344, "objective/kl": 21.132511138916016, "objective/non_score_reward": -2.113251209259033, "objective/rlhf_reward": -6.848884735170918, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 55.41283416748047, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5207578539848328, "step": 332, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.999730110168457 }, { "episode": 5344, "epoch": 0.09605636840780818, "loss/policy_avg": 0.08111564069986343, "lr": 9.787193251533743e-06, "objective/entropy": -32.56279754638672, "objective/kl": 26.932476043701172, "objective/non_score_reward": -2.6932475566864014, "objective/rlhf_reward": -9.447477314501924, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 31.9769344329834, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5189494490623474, "step": 333, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9998490810394287 }, { "episode": 5360, "epoch": 0.09634396232519682, "loss/policy_avg": 0.12806567549705505, "lr": 9.78655419222904e-06, "objective/entropy": -60.638038635253906, "objective/kl": 33.80628204345703, "objective/non_score_reward": -3.3806281089782715, "objective/rlhf_reward": -12.006740414889988, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 97.76350402832031, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5655949115753174, "step": 334, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9991028308868408 }, { "episode": 5376, "epoch": 0.09663155624258547, "loss/policy_avg": 0.4162527918815613, "lr": 9.785915132924337e-06, "objective/entropy": 73.74658203125, "objective/kl": 28.956912994384766, "objective/non_score_reward": -2.895691394805908, "objective/rlhf_reward": -9.757937069210122, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 12.659797668457031, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6926892995834351, "step": 335, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9990322589874268 }, { "episode": 5392, "epoch": 0.09691915015997411, "loss/policy_avg": 0.6766362190246582, "lr": 9.785276073619633e-06, "objective/entropy": -167.6099090576172, "objective/kl": 33.4842414855957, "objective/non_score_reward": -3.3484244346618652, "objective/rlhf_reward": -10.46997794950125, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 45.80317687988281, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6405798196792603, "step": 336, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.999110221862793 }, { "episode": 5408, "epoch": 0.09720674407736277, "loss/policy_avg": 0.7705954909324646, "lr": 9.784637014314929e-06, "objective/entropy": 189.44476318359375, "objective/kl": 40.57612991333008, "objective/non_score_reward": -4.057613372802734, "objective/rlhf_reward": -14.283041070179877, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 47.49778747558594, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7007959485054016, "step": 337, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9955544471740723 }, { "episode": 5424, "epoch": 0.0974943379947514, "loss/policy_avg": 0.8678327798843384, "lr": 9.783997955010226e-06, "objective/entropy": 138.7545166015625, "objective/kl": 43.06449890136719, "objective/non_score_reward": -4.306450366973877, "objective/rlhf_reward": -15.775203089328155, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 187.52108764648438, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6742819547653198, "step": 338, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.000286102294922 }, { "episode": 5440, "epoch": 0.09778193191214006, "loss/policy_avg": 0.13020552694797516, "lr": 9.783358895705522e-06, "objective/entropy": -34.55393981933594, "objective/kl": 27.52876091003418, "objective/non_score_reward": -2.7528762817382812, "objective/rlhf_reward": -9.530552032406687, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 5.538684844970703, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.45005565881729126, "step": 339, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.000762462615967 }, { "episode": 5456, "epoch": 0.0980695258295287, "loss/policy_avg": 0.8651669025421143, "lr": 9.78271983640082e-06, "objective/entropy": 184.3627471923828, "objective/kl": 31.240346908569336, "objective/non_score_reward": -3.124034881591797, "objective/rlhf_reward": -12.49613881111145, "objective/scores": 0.0, "policy/approxkl_avg": 16.510074615478516, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4378349781036377, "step": 340, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.998918056488037 }, { "episode": 5472, "epoch": 0.09835711974691735, "loss/policy_avg": 0.13001634180545807, "lr": 9.782080777096116e-06, "objective/entropy": 123.772705078125, "objective/kl": 33.01024627685547, "objective/non_score_reward": -3.3010246753692627, "objective/rlhf_reward": -8.804098105430603, "objective/scores": 1.1, "policy/approxkl_avg": 55.6832275390625, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7997548580169678, "step": 341, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.001884937286377 }, { "episode": 5488, "epoch": 0.098644713664306, "loss/policy_avg": 0.538088321685791, "lr": 9.781441717791413e-06, "objective/entropy": 208.26202392578125, "objective/kl": 28.19437026977539, "objective/non_score_reward": -2.819437026977539, "objective/rlhf_reward": -9.33033687897199, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 56.31122589111328, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.775277316570282, "step": 342, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9953358173370361 }, { "episode": 5504, "epoch": 0.09893230758169465, "loss/policy_avg": -0.014354228973388672, "lr": 9.780802658486708e-06, "objective/entropy": -60.35287857055664, "objective/kl": 25.630271911621094, "objective/non_score_reward": -2.5630269050598145, "objective/rlhf_reward": -8.926595482855959, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 1.3227713108062744, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4413827657699585, "step": 343, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0038928985595703 }, { "episode": 5520, "epoch": 0.09921990149908329, "loss/policy_avg": 0.27923208475112915, "lr": 9.780163599182005e-06, "objective/entropy": -24.742401123046875, "objective/kl": 31.480648040771484, "objective/non_score_reward": -3.1480648517608643, "objective/rlhf_reward": -11.233009540770931, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 13.539884567260742, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5237823724746704, "step": 344, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9998574256896973 }, { "episode": 5536, "epoch": 0.09950749541647194, "loss/policy_avg": 0.1885061115026474, "lr": 9.7795245398773e-06, "objective/entropy": 182.22181701660156, "objective/kl": 29.661117553710938, "objective/non_score_reward": -2.966111660003662, "objective/rlhf_reward": -10.522810748129515, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 3.2202861309051514, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.6761025786399841, "step": 345, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.999419927597046 }, { "episode": 5552, "epoch": 0.0997950893338606, "loss/policy_avg": 0.7343586087226868, "lr": 9.778885480572597e-06, "objective/entropy": 145.13526916503906, "objective/kl": 45.35038375854492, "objective/non_score_reward": -4.535038471221924, "objective/rlhf_reward": -16.315324659618447, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 33.988563537597656, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7404603958129883, "step": 346, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.995590329170227 }, { "episode": 5568, "epoch": 0.10008268325124924, "loss/policy_avg": 0.6405590772628784, "lr": 9.778246421267894e-06, "objective/entropy": 162.7369842529297, "objective/kl": 37.150367736816406, "objective/non_score_reward": -3.7150371074676514, "objective/rlhf_reward": -13.379196050579905, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 36.95792770385742, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.8137757778167725, "step": 347, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9953217506408691 }, { "episode": 5584, "epoch": 0.10037027716863789, "loss/policy_avg": 0.13212129473686218, "lr": 9.777607361963191e-06, "objective/entropy": 206.94252014160156, "objective/kl": 34.0411262512207, "objective/non_score_reward": -3.4041128158569336, "objective/rlhf_reward": -11.216450786590576, "objective/scores": 0.6, "policy/approxkl_avg": 133.2515869140625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7521044015884399, "step": 348, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9959970712661743 }, { "episode": 5600, "epoch": 0.10065787108602653, "loss/policy_avg": 0.9090590476989746, "lr": 9.776968302658488e-06, "objective/entropy": 55.456298828125, "objective/kl": 24.91229248046875, "objective/non_score_reward": -2.4912290573120117, "objective/rlhf_reward": -8.140087719234536, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 29.07049560546875, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.8177493810653687, "step": 349, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9979220628738403 }, { "episode": 5616, "epoch": 0.10094546500341518, "loss/policy_avg": 0.46943986415863037, "lr": 9.776329243353783e-06, "objective/entropy": 153.11770629882812, "objective/kl": 31.714759826660156, "objective/non_score_reward": -3.171476125717163, "objective/rlhf_reward": -9.762185250164244, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 49.198020935058594, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.3995407819747925, "step": 350, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9990980625152588 }, { "episode": 5632, "epoch": 0.10123305892080382, "loss/policy_avg": 0.12656962871551514, "lr": 9.77569018404908e-06, "objective/entropy": 109.22264862060547, "objective/kl": 28.461389541625977, "objective/non_score_reward": -2.8461389541625977, "objective/rlhf_reward": -8.46083704078314, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 40.512847900390625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.588497519493103, "step": 351, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9977093935012817 }, { "episode": 5648, "epoch": 0.10152065283819248, "loss/policy_avg": 0.7170840501785278, "lr": 9.775051124744377e-06, "objective/entropy": 14.107101440429688, "objective/kl": 41.7979736328125, "objective/non_score_reward": -4.179797172546387, "objective/rlhf_reward": -15.377552798300414, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 11.696022987365723, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7339128255844116, "step": 352, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9979870319366455 }, { "episode": 5664, "epoch": 0.10180824675558112, "loss/policy_avg": 0.8306883573532104, "lr": 9.774412065439674e-06, "objective/entropy": -67.41658782958984, "objective/kl": 26.34395408630371, "objective/non_score_reward": -2.6343955993652344, "objective/rlhf_reward": -8.137582039833068, "objective/scores": 0.6, "policy/approxkl_avg": 115.25839233398438, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6014617681503296, "step": 353, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9970800876617432 }, { "episode": 5680, "epoch": 0.10209584067296977, "loss/policy_avg": 2.176168918609619, "lr": 9.77377300613497e-06, "objective/entropy": 134.90728759765625, "objective/kl": 31.819995880126953, "objective/non_score_reward": -3.181999683380127, "objective/rlhf_reward": -12.727998733520508, "objective/scores": 0.0, "policy/approxkl_avg": 26.059894561767578, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5675879716873169, "step": 354, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9992128610610962 }, { "episode": 5696, "epoch": 0.10238343459035841, "loss/policy_avg": 0.9548969268798828, "lr": 9.773133946830267e-06, "objective/entropy": -45.11736297607422, "objective/kl": 30.003692626953125, "objective/non_score_reward": -3.000369071960449, "objective/rlhf_reward": -10.659840753584533, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 80.65755462646484, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6548440456390381, "step": 355, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9980876445770264 }, { "episode": 5712, "epoch": 0.10267102850774706, "loss/policy_avg": -0.09791913628578186, "lr": 9.772494887525563e-06, "objective/entropy": 59.10938262939453, "objective/kl": 24.62106704711914, "objective/non_score_reward": -2.462106943130493, "objective/rlhf_reward": -8.489177667830868, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 34.42068099975586, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7636164426803589, "step": 356, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.001443862915039 }, { "episode": 5728, "epoch": 0.1029586224251357, "loss/policy_avg": 0.6120666265487671, "lr": 9.77185582822086e-06, "objective/entropy": 222.30874633789062, "objective/kl": 32.64442825317383, "objective/non_score_reward": -3.2644426822662354, "objective/rlhf_reward": -11.576818349774243, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 6.193035125732422, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5914427042007446, "step": 357, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.002417802810669 }, { "episode": 5744, "epoch": 0.10324621634252436, "loss/policy_avg": 0.25659894943237305, "lr": 9.771216768916156e-06, "objective/entropy": 173.52723693847656, "objective/kl": 29.877527236938477, "objective/non_score_reward": -2.987752914428711, "objective/rlhf_reward": -7.551011657714843, "objective/scores": 1.1, "policy/approxkl_avg": 18.964191436767578, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7272264361381531, "step": 358, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9968700408935547 }, { "episode": 5760, "epoch": 0.103533810259913, "loss/policy_avg": 0.4551319479942322, "lr": 9.770577709611453e-06, "objective/entropy": 66.63546752929688, "objective/kl": 29.777273178100586, "objective/non_score_reward": -2.9777274131774902, "objective/rlhf_reward": -9.510909175872802, "objective/scores": 0.6, "policy/approxkl_avg": 110.96263885498047, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.554497241973877, "step": 359, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9965577125549316 }, { "episode": 5776, "epoch": 0.10382140417730165, "loss/policy_avg": 1.5252394676208496, "lr": 9.76993865030675e-06, "objective/entropy": -97.26277923583984, "objective/kl": 33.4285888671875, "objective/non_score_reward": -3.3428590297698975, "objective/rlhf_reward": -11.890483262951733, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 8.201589584350586, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4571065902709961, "step": 360, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9999207258224487 }, { "episode": 5792, "epoch": 0.10410899809469029, "loss/policy_avg": 1.1003179550170898, "lr": 9.769299591002045e-06, "objective/entropy": 51.82417297363281, "objective/kl": 34.724029541015625, "objective/non_score_reward": -3.472402811050415, "objective/rlhf_reward": -12.33335193893011, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 8.241430282592773, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.7186964750289917, "step": 361, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9989242553710938 }, { "episode": 5808, "epoch": 0.10439659201207895, "loss/policy_avg": 0.40074190497398376, "lr": 9.768660531697342e-06, "objective/entropy": 219.26010131835938, "objective/kl": 36.2478141784668, "objective/non_score_reward": -3.6247811317443848, "objective/rlhf_reward": -12.895005021158774, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 5.6230387687683105, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5367094278335571, "step": 362, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9987454414367676 }, { "episode": 5824, "epoch": 0.10468418592946759, "loss/policy_avg": 0.9861453771591187, "lr": 9.768021472392639e-06, "objective/entropy": -9.609394073486328, "objective/kl": 39.06307601928711, "objective/non_score_reward": -3.9063076972961426, "objective/rlhf_reward": -14.109459364207918, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 78.4552993774414, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6734092235565186, "step": 363, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9968814849853516 }, { "episode": 5840, "epoch": 0.10497177984685624, "loss/policy_avg": 0.18136531114578247, "lr": 9.767382413087936e-06, "objective/entropy": 78.3685073852539, "objective/kl": 38.321044921875, "objective/non_score_reward": -3.8321046829223633, "objective/rlhf_reward": -13.503589506420205, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 127.91275787353516, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.46624571084976196, "step": 364, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0008468627929688 }, { "episode": 5856, "epoch": 0.10525937376424489, "loss/policy_avg": -0.3799706697463989, "lr": 9.766743353783233e-06, "objective/entropy": 138.2041473388672, "objective/kl": 46.876441955566406, "objective/non_score_reward": -4.6876444816589355, "objective/rlhf_reward": -14.350577926635744, "objective/scores": 1.1, "policy/approxkl_avg": 66.94557189941406, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5722706317901611, "step": 365, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9999732971191406 }, { "episode": 5872, "epoch": 0.10554696768163353, "loss/policy_avg": 0.034319084137678146, "lr": 9.76610429447853e-06, "objective/entropy": 45.21516418457031, "objective/kl": 30.351581573486328, "objective/non_score_reward": -3.035158157348633, "objective/rlhf_reward": -9.21691409194586, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 5.7516632080078125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4482240676879883, "step": 366, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9992876052856445 }, { "episode": 5888, "epoch": 0.10583456159902219, "loss/policy_avg": 0.8865995407104492, "lr": 9.765465235173825e-06, "objective/entropy": -21.33509063720703, "objective/kl": 35.2110595703125, "objective/non_score_reward": -3.5211057662963867, "objective/rlhf_reward": -12.35109020868937, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 39.482017517089844, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6004269123077393, "step": 367, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9980597496032715 }, { "episode": 5904, "epoch": 0.10612215551641083, "loss/policy_avg": 0.14120006561279297, "lr": 9.764826175869122e-06, "objective/entropy": 314.3269348144531, "objective/kl": 33.36817932128906, "objective/non_score_reward": -3.336818218231201, "objective/rlhf_reward": -11.68541360420047, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 111.91177368164062, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7748069763183594, "step": 368, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.0023789405822754 }, { "episode": 5920, "epoch": 0.10640974943379948, "loss/policy_avg": 1.3205476999282837, "lr": 9.764187116564417e-06, "objective/entropy": -41.12682342529297, "objective/kl": 31.178136825561523, "objective/non_score_reward": -3.1178135871887207, "objective/rlhf_reward": -10.990302207882763, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 50.676719665527344, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.9075043797492981, "step": 369, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9977303743362427 }, { "episode": 5936, "epoch": 0.10669734335118812, "loss/policy_avg": 0.4172307848930359, "lr": 9.763548057259714e-06, "objective/entropy": 151.11341857910156, "objective/kl": 29.471710205078125, "objective/non_score_reward": -2.947171211242676, "objective/rlhf_reward": -10.410082795707089, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 31.20602035522461, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6370272636413574, "step": 370, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9985219240188599 }, { "episode": 5952, "epoch": 0.10698493726857677, "loss/policy_avg": -0.09500053524971008, "lr": 9.76290899795501e-06, "objective/entropy": -34.93052673339844, "objective/kl": 32.19451904296875, "objective/non_score_reward": -3.219452142715454, "objective/rlhf_reward": -11.273688945833761, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 64.82252502441406, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7023112773895264, "step": 371, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.001070499420166 }, { "episode": 5968, "epoch": 0.10727253118596541, "loss/policy_avg": 0.6650490760803223, "lr": 9.762269938650308e-06, "objective/entropy": -44.10865783691406, "objective/kl": 27.115589141845703, "objective/non_score_reward": -2.7115590572357178, "objective/rlhf_reward": -9.021407242092202, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 34.185760498046875, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7515213489532471, "step": 372, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9973523616790771 }, { "episode": 5984, "epoch": 0.10756012510335407, "loss/policy_avg": 0.7072340846061707, "lr": 9.761630879345604e-06, "objective/entropy": 4.434268951416016, "objective/kl": 43.21569061279297, "objective/non_score_reward": -4.321569442749023, "objective/rlhf_reward": -15.770505034717257, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 2.586810350418091, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6180188655853271, "step": 373, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0001654624938965 }, { "episode": 6000, "epoch": 0.10784771902074271, "loss/policy_avg": 1.28859281539917, "lr": 9.7609918200409e-06, "objective/entropy": -139.96766662597656, "objective/kl": 30.635095596313477, "objective/non_score_reward": -3.063509464263916, "objective/rlhf_reward": -7.854037737846375, "objective/scores": 1.1, "policy/approxkl_avg": 153.5921630859375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7665011882781982, "step": 374, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9986308813095093 }, { "episode": 6016, "epoch": 0.10813531293813136, "loss/policy_avg": 1.1559712886810303, "lr": 9.760352760736196e-06, "objective/entropy": 112.28376007080078, "objective/kl": 48.56169891357422, "objective/non_score_reward": -4.856169700622559, "objective/rlhf_reward": -18.00084741850671, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 29.986862182617188, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.86993408203125, "step": 375, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9950189590454102 }, { "episode": 6032, "epoch": 0.10842290685552, "loss/policy_avg": 0.43735095858573914, "lr": 9.759713701431493e-06, "objective/entropy": 161.14744567871094, "objective/kl": 20.346540451049805, "objective/non_score_reward": -2.034654140472412, "objective/rlhf_reward": -6.657663944180369, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 8.951998710632324, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.7470377683639526, "step": 376, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.000926971435547 }, { "episode": 6048, "epoch": 0.10871050077290866, "loss/policy_avg": 0.25953274965286255, "lr": 9.75907464212679e-06, "objective/entropy": -127.31167602539062, "objective/kl": 32.83821105957031, "objective/non_score_reward": -3.283820867538452, "objective/rlhf_reward": -13.135283589363098, "objective/scores": 0.0, "policy/approxkl_avg": 43.502960205078125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7563947439193726, "step": 377, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9980974197387695 }, { "episode": 6064, "epoch": 0.1089980946902973, "loss/policy_avg": 1.1847639083862305, "lr": 9.758435582822087e-06, "objective/entropy": 53.43251037597656, "objective/kl": 30.13711929321289, "objective/non_score_reward": -3.013711929321289, "objective/rlhf_reward": -7.654847121238708, "objective/scores": 1.1, "policy/approxkl_avg": 24.648468017578125, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5256083607673645, "step": 378, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0001864433288574 }, { "episode": 6080, "epoch": 0.10928568860768595, "loss/policy_avg": 0.10543081164360046, "lr": 9.757796523517384e-06, "objective/entropy": 216.22293090820312, "objective/kl": 33.44567108154297, "objective/non_score_reward": -3.344566822052002, "objective/rlhf_reward": -11.99966583499084, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 30.81055450439453, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5716228485107422, "step": 379, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9968583583831787 }, { "episode": 6096, "epoch": 0.10957328252507459, "loss/policy_avg": 0.3527596592903137, "lr": 9.75715746421268e-06, "objective/entropy": -127.59818267822266, "objective/kl": 31.49237632751465, "objective/non_score_reward": -3.149237632751465, "objective/rlhf_reward": -12.596950769424438, "objective/scores": 0.0, "policy/approxkl_avg": 19.017166137695312, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4384676218032837, "step": 380, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9995179176330566 }, { "episode": 6112, "epoch": 0.10986087644246324, "loss/policy_avg": 0.9311287999153137, "lr": 9.756518404907976e-06, "objective/entropy": 117.0103530883789, "objective/kl": 30.302433013916016, "objective/non_score_reward": -3.030243158340454, "objective/rlhf_reward": -10.459113364637481, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 65.951171875, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.41777661442756653, "step": 381, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9992172718048096 }, { "episode": 6128, "epoch": 0.11014847035985188, "loss/policy_avg": 0.027314603328704834, "lr": 9.755879345603273e-06, "objective/entropy": 82.98536682128906, "objective/kl": 41.457672119140625, "objective/non_score_reward": -4.1457672119140625, "objective/rlhf_reward": -15.241434386282592, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 5.37526273727417, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7102963924407959, "step": 382, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.000887632369995 }, { "episode": 6144, "epoch": 0.11043606427724054, "loss/policy_avg": -0.5239760279655457, "lr": 9.75524028629857e-06, "objective/entropy": -19.319984436035156, "objective/kl": 31.706575393676758, "objective/non_score_reward": -3.1706576347351074, "objective/rlhf_reward": -10.282630062103273, "objective/scores": 0.6, "policy/approxkl_avg": 33.74637222290039, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.521426796913147, "step": 383, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0061190128326416 }, { "episode": 6160, "epoch": 0.11072365819462919, "loss/policy_avg": 0.19491565227508545, "lr": 9.754601226993867e-06, "objective/entropy": 153.27801513671875, "objective/kl": 30.898479461669922, "objective/non_score_reward": -3.0898478031158447, "objective/rlhf_reward": -10.84361919144028, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 24.972707748413086, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6943016648292542, "step": 384, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9995365142822266 }, { "episode": 6176, "epoch": 0.11101125211201783, "loss/policy_avg": 0.9045780897140503, "lr": 9.753962167689162e-06, "objective/entropy": 229.45260620117188, "objective/kl": 45.034461975097656, "objective/non_score_reward": -4.503446578979492, "objective/rlhf_reward": -16.563187460513458, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 8.020683288574219, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5668317079544067, "step": 385, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.000062942504883 }, { "episode": 6192, "epoch": 0.11129884602940648, "loss/policy_avg": 0.33030185103416443, "lr": 9.753323108384459e-06, "objective/entropy": 153.65707397460938, "objective/kl": 42.31884002685547, "objective/non_score_reward": -4.231884002685547, "objective/rlhf_reward": -15.371276705470635, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 176.17214965820312, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6527254581451416, "step": 386, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9980921745300293 }, { "episode": 6208, "epoch": 0.11158643994679512, "loss/policy_avg": 1.2582824230194092, "lr": 9.752684049079756e-06, "objective/entropy": 212.47308349609375, "objective/kl": 41.99869918823242, "objective/non_score_reward": -4.1998701095581055, "objective/rlhf_reward": -14.852069209294257, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 34.943233489990234, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.551336407661438, "step": 387, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.997947335243225 }, { "episode": 6224, "epoch": 0.11187403386418378, "loss/policy_avg": 1.361016035079956, "lr": 9.752044989775053e-06, "objective/entropy": -335.09619140625, "objective/kl": 30.397010803222656, "objective/non_score_reward": -3.039701223373413, "objective/rlhf_reward": -10.333976383480142, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 14.473678588867188, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.9271190166473389, "step": 388, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9977775812149048 }, { "episode": 6240, "epoch": 0.11216162778157242, "loss/policy_avg": 0.34025201201438904, "lr": 9.751405930470348e-06, "objective/entropy": 50.92825698852539, "objective/kl": 39.54961013793945, "objective/non_score_reward": -3.954960823059082, "objective/rlhf_reward": -13.995015259059976, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 8.61404037475586, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.47977396845817566, "step": 389, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9968550205230713 }, { "episode": 6256, "epoch": 0.11244922169896107, "loss/policy_avg": 0.012692228890955448, "lr": 9.750766871165645e-06, "objective/entropy": -33.92766571044922, "objective/kl": 31.518718719482422, "objective/non_score_reward": -3.151872158050537, "objective/rlhf_reward": -8.207488393783569, "objective/scores": 1.1, "policy/approxkl_avg": 84.33369445800781, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5067895650863647, "step": 390, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9976071119308472 }, { "episode": 6272, "epoch": 0.11273681561634971, "loss/policy_avg": 0.5984074473381042, "lr": 9.750127811860941e-06, "objective/entropy": -239.443359375, "objective/kl": 31.10334014892578, "objective/non_score_reward": -3.1103343963623047, "objective/rlhf_reward": -10.041337525844575, "objective/scores": 0.6, "policy/approxkl_avg": 30.063674926757812, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7201836705207825, "step": 391, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.000786304473877 }, { "episode": 6288, "epoch": 0.11302440953373837, "loss/policy_avg": 0.7581092715263367, "lr": 9.749488752556238e-06, "objective/entropy": 85.20730590820312, "objective/kl": 40.380855560302734, "objective/non_score_reward": -4.038085460662842, "objective/rlhf_reward": -14.701744298549041, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 18.875045776367188, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5445913672447205, "step": 392, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9985957145690918 }, { "episode": 6304, "epoch": 0.113312003451127, "loss/policy_avg": 1.7639085054397583, "lr": 9.748849693251534e-06, "objective/entropy": 124.08705139160156, "objective/kl": 37.808753967285156, "objective/non_score_reward": -3.7808759212493896, "objective/rlhf_reward": -13.298674936565469, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 16.500898361206055, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.4794216752052307, "step": 393, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0014162063598633 }, { "episode": 6320, "epoch": 0.11359959736851566, "loss/policy_avg": 0.012201100587844849, "lr": 9.74821063394683e-06, "objective/entropy": 200.1130828857422, "objective/kl": 30.82569122314453, "objective/non_score_reward": -3.082569122314453, "objective/rlhf_reward": -7.930276489257812, "objective/scores": 1.1, "policy/approxkl_avg": 7.863556861877441, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5342352390289307, "step": 394, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.999612808227539 }, { "episode": 6336, "epoch": 0.1138871912859043, "loss/policy_avg": 2.2059273719787598, "lr": 9.747571574642127e-06, "objective/entropy": -69.09872436523438, "objective/kl": 40.18467330932617, "objective/non_score_reward": -4.018467426300049, "objective/rlhf_reward": -14.650037605960932, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 23.521875381469727, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5724920034408569, "step": 395, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9997222423553467 }, { "episode": 6352, "epoch": 0.11417478520329295, "loss/policy_avg": 0.4041597843170166, "lr": 9.746932515337424e-06, "objective/entropy": -215.51731872558594, "objective/kl": 27.624664306640625, "objective/non_score_reward": -2.7624664306640625, "objective/rlhf_reward": -11.049865961074829, "objective/scores": 0.0, "policy/approxkl_avg": 39.29521560668945, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6042770743370056, "step": 396, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9989702701568604 }, { "episode": 6368, "epoch": 0.11446237912068159, "loss/policy_avg": 0.4775196313858032, "lr": 9.746293456032721e-06, "objective/entropy": 41.82182693481445, "objective/kl": 36.31709289550781, "objective/non_score_reward": -3.631709575653076, "objective/rlhf_reward": -12.702010269435952, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 44.893619537353516, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.37915006279945374, "step": 397, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.001455783843994 }, { "episode": 6384, "epoch": 0.11474997303807025, "loss/policy_avg": 0.056639641523361206, "lr": 9.745654396728016e-06, "objective/entropy": -153.1647186279297, "objective/kl": 32.43135452270508, "objective/non_score_reward": -3.243135452270508, "objective/rlhf_reward": -11.310682540357696, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 2.9430336952209473, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7443736791610718, "step": 398, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.000112533569336 }, { "episode": 6400, "epoch": 0.11503756695545889, "loss/policy_avg": 0.045253098011016846, "lr": 9.745015337423313e-06, "objective/entropy": -105.165283203125, "objective/kl": 39.292572021484375, "objective/non_score_reward": -3.929257392883301, "objective/rlhf_reward": -14.236076953823925, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 31.803394317626953, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.41279107332229614, "step": 399, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.001852512359619 }, { "episode": 6416, "epoch": 0.11532516087284754, "loss/policy_avg": 1.3353252410888672, "lr": 9.74437627811861e-06, "objective/entropy": 56.36566925048828, "objective/kl": 36.79115676879883, "objective/non_score_reward": -3.6791152954101562, "objective/rlhf_reward": -12.983128563563028, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 52.49983215332031, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5429282188415527, "step": 400, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9969511032104492 }, { "episode": 6432, "epoch": 0.11561275479023618, "loss/policy_avg": 0.19346949458122253, "lr": 9.743737218813907e-06, "objective/entropy": 94.13348388671875, "objective/kl": 33.9053840637207, "objective/non_score_reward": -3.3905386924743652, "objective/rlhf_reward": -12.111556748957977, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 10.45969009399414, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5399774312973022, "step": 401, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9987437725067139 }, { "episode": 6448, "epoch": 0.11590034870762483, "loss/policy_avg": 0.14212624728679657, "lr": 9.743098159509204e-06, "objective/entropy": -67.64189147949219, "objective/kl": 23.04766273498535, "objective/non_score_reward": -2.3047664165496826, "objective/rlhf_reward": -7.662806778159693, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 17.699844360351562, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5857589244842529, "step": 402, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000443696975708 }, { "episode": 6464, "epoch": 0.11618794262501349, "loss/policy_avg": 2.842088222503662, "lr": 9.7424591002045e-06, "objective/entropy": 104.11701965332031, "objective/kl": 37.51358413696289, "objective/non_score_reward": -3.7513585090637207, "objective/rlhf_reward": -12.081715498806211, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 18.802593231201172, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.8919892311096191, "step": 403, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.999840497970581 }, { "episode": 6480, "epoch": 0.11647553654240213, "loss/policy_avg": 3.926600456237793, "lr": 9.741820040899796e-06, "objective/entropy": -60.85142517089844, "objective/kl": 39.3304557800293, "objective/non_score_reward": -3.9330458641052246, "objective/rlhf_reward": -15.732182502746582, "objective/scores": 0.0, "policy/approxkl_avg": 15.211052894592285, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.6057410836219788, "step": 404, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9988353252410889 }, { "episode": 6496, "epoch": 0.11676313045979078, "loss/policy_avg": 0.7047057747840881, "lr": 9.741180981595093e-06, "objective/entropy": 86.78068542480469, "objective/kl": 32.590457916259766, "objective/non_score_reward": -3.2590458393096924, "objective/rlhf_reward": -10.913477124945196, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 73.14445495605469, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5404595136642456, "step": 405, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9992141723632812 }, { "episode": 6512, "epoch": 0.11705072437717942, "loss/policy_avg": 0.7668646574020386, "lr": 9.74054192229039e-06, "objective/entropy": 9.115959167480469, "objective/kl": 35.6148796081543, "objective/non_score_reward": -3.561488389968872, "objective/rlhf_reward": -12.123247566000494, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 24.980825424194336, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.684908390045166, "step": 406, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9991612434387207 }, { "episode": 6528, "epoch": 0.11733831829456808, "loss/policy_avg": 0.901952862739563, "lr": 9.739902862985686e-06, "objective/entropy": 47.42900848388672, "objective/kl": 36.136173248291016, "objective/non_score_reward": -3.613617420196533, "objective/rlhf_reward": -12.72113610903422, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 29.850797653198242, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5216494202613831, "step": 407, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9996336698532104 }, { "episode": 6544, "epoch": 0.11762591221195672, "loss/policy_avg": 0.4201366901397705, "lr": 9.739263803680983e-06, "objective/entropy": -11.0733642578125, "objective/kl": 35.00093078613281, "objective/non_score_reward": -3.5000932216644287, "objective/rlhf_reward": -9.600372886657714, "objective/scores": 1.1, "policy/approxkl_avg": 32.18763732910156, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5420930981636047, "step": 408, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.997605323791504 }, { "episode": 6560, "epoch": 0.11791350612934537, "loss/policy_avg": 1.302764892578125, "lr": 9.73862474437628e-06, "objective/entropy": 168.5387420654297, "objective/kl": 26.525001525878906, "objective/non_score_reward": -2.6525001525878906, "objective/rlhf_reward": -9.094229185374912, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 59.64923858642578, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.529288649559021, "step": 409, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9986768960952759 }, { "episode": 6576, "epoch": 0.11820110004673401, "loss/policy_avg": 1.0619229078292847, "lr": 9.737985685071575e-06, "objective/entropy": -54.82817459106445, "objective/kl": 37.211219787597656, "objective/non_score_reward": -3.7211220264434814, "objective/rlhf_reward": -13.328228085246636, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 47.928985595703125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5946022272109985, "step": 410, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.001032590866089 }, { "episode": 6592, "epoch": 0.11848869396412266, "loss/policy_avg": 0.4641076922416687, "lr": 9.737346625766872e-06, "objective/entropy": 80.71646881103516, "objective/kl": 35.40373992919922, "objective/non_score_reward": -3.5403738021850586, "objective/rlhf_reward": -12.680542829449536, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 4.019253730773926, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.5870873928070068, "step": 411, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9998197555541992 }, { "episode": 6608, "epoch": 0.1187762878815113, "loss/policy_avg": 0.3565133213996887, "lr": 9.736707566462167e-06, "objective/entropy": 122.7892074584961, "objective/kl": 39.498130798339844, "objective/non_score_reward": -3.9498136043548584, "objective/rlhf_reward": -13.67654818512586, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 63.53807830810547, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.581372857093811, "step": 412, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0015203952789307 }, { "episode": 6624, "epoch": 0.11906388179889996, "loss/policy_avg": 0.14506877958774567, "lr": 9.736068507157464e-06, "objective/entropy": 193.5592041015625, "objective/kl": 30.521562576293945, "objective/non_score_reward": -3.052156448364258, "objective/rlhf_reward": -10.475291983286539, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 22.271638870239258, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7713235020637512, "step": 413, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9975152015686035 }, { "episode": 6640, "epoch": 0.1193514757162886, "loss/policy_avg": 0.9468994736671448, "lr": 9.735429447852761e-06, "objective/entropy": 148.8424835205078, "objective/kl": 37.5145378112793, "objective/non_score_reward": -3.7514538764953613, "objective/rlhf_reward": -15.005815267562866, "objective/scores": 0.0, "policy/approxkl_avg": 9.498788833618164, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.38916516304016113, "step": 414, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.000758171081543 }, { "episode": 6656, "epoch": 0.11963906963367725, "loss/policy_avg": 0.7254658937454224, "lr": 9.734790388548058e-06, "objective/entropy": 56.421714782714844, "objective/kl": 33.228389739990234, "objective/non_score_reward": -3.32283878326416, "objective/rlhf_reward": -11.46652638462455, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 17.776447296142578, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.6019710302352905, "step": 415, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0001137256622314 }, { "episode": 6672, "epoch": 0.11992666355106589, "loss/policy_avg": 0.12397602200508118, "lr": 9.734151329243355e-06, "objective/entropy": -148.2471466064453, "objective/kl": 25.882095336914062, "objective/non_score_reward": -2.588209629058838, "objective/rlhf_reward": -8.974236705390316, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 0.8484023809432983, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4417728781700134, "step": 416, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.002113103866577 }, { "episode": 6688, "epoch": 0.12021425746845454, "loss/policy_avg": -0.03540700674057007, "lr": 9.73351226993865e-06, "objective/entropy": -65.22505187988281, "objective/kl": 25.781585693359375, "objective/non_score_reward": -2.5781586170196533, "objective/rlhf_reward": -10.312634468078613, "objective/scores": 0.0, "policy/approxkl_avg": 0.9484915733337402, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.44973552227020264, "step": 417, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.003369092941284 }, { "episode": 6704, "epoch": 0.12050185138584318, "loss/policy_avg": 2.237513303756714, "lr": 9.732873210633947e-06, "objective/entropy": 72.41790008544922, "objective/kl": 41.708648681640625, "objective/non_score_reward": -4.170865058898926, "objective/rlhf_reward": -13.759741698147032, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 6.452242851257324, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.46763697266578674, "step": 418, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.999596357345581 }, { "episode": 6720, "epoch": 0.12078944530323184, "loss/policy_avg": -0.033215656876564026, "lr": 9.732234151329244e-06, "objective/entropy": 116.18624877929688, "objective/kl": 41.70143508911133, "objective/non_score_reward": -4.170144081115723, "objective/rlhf_reward": -15.018715386808502, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 0.950503408908844, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5579153299331665, "step": 419, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0016934871673584 }, { "episode": 6736, "epoch": 0.12107703922062048, "loss/policy_avg": 0.5230793952941895, "lr": 9.73159509202454e-06, "objective/entropy": 87.67442321777344, "objective/kl": 42.121944427490234, "objective/non_score_reward": -4.212194442749023, "objective/rlhf_reward": -15.523264799147768, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 44.811546325683594, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5887485146522522, "step": 420, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9986464977264404 }, { "episode": 6752, "epoch": 0.12136463313800913, "loss/policy_avg": 0.09617140889167786, "lr": 9.730956032719838e-06, "objective/entropy": 197.31307983398438, "objective/kl": 41.32299041748047, "objective/non_score_reward": -4.132298469543457, "objective/rlhf_reward": -15.129195547103883, "objective/scores": 0.35, "policy/approxkl_avg": 7.07308292388916, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5032777786254883, "step": 421, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.0003557205200195 }, { "episode": 6768, "epoch": 0.12165222705539779, "loss/policy_avg": 0.0820683017373085, "lr": 9.730316973415135e-06, "objective/entropy": 90.92608642578125, "objective/kl": 33.22870635986328, "objective/non_score_reward": -3.3228707313537598, "objective/rlhf_reward": -11.735222904887749, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 6.888459205627441, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6309401392936707, "step": 422, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.998487949371338 }, { "episode": 6784, "epoch": 0.12193982097278643, "loss/policy_avg": 0.13335853815078735, "lr": 9.72967791411043e-06, "objective/entropy": 58.8111686706543, "objective/kl": 17.325424194335938, "objective/non_score_reward": -1.7325425148010254, "objective/rlhf_reward": -5.196836725870767, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 2.40964412689209, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4101444184780121, "step": 423, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.999779462814331 }, { "episode": 6800, "epoch": 0.12222741489017508, "loss/policy_avg": 1.1839892864227295, "lr": 9.729038854805727e-06, "objective/entropy": 278.55230712890625, "objective/kl": 36.13326644897461, "objective/non_score_reward": -3.6133267879486084, "objective/rlhf_reward": -12.7199738184611, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 37.6474609375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6956678628921509, "step": 424, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.999556303024292 }, { "episode": 6816, "epoch": 0.12251500880756372, "loss/policy_avg": 0.5160382390022278, "lr": 9.728399795501023e-06, "objective/entropy": -4.561044692993164, "objective/kl": 48.20618438720703, "objective/non_score_reward": -4.820618152618408, "objective/rlhf_reward": -17.801520469601513, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 29.267677307128906, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6663553714752197, "step": 425, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9961378574371338 }, { "episode": 6832, "epoch": 0.12280260272495237, "loss/policy_avg": -0.027832061052322388, "lr": 9.72776073619632e-06, "objective/entropy": -14.169868469238281, "objective/kl": 28.816591262817383, "objective/non_score_reward": -2.8816590309143066, "objective/rlhf_reward": -10.126636123657228, "objective/scores": 0.35, "policy/approxkl_avg": 10.623421669006348, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.42182457447052, "step": 426, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0000364780426025 }, { "episode": 6848, "epoch": 0.12309019664234101, "loss/policy_avg": 0.9478355050086975, "lr": 9.727121676891617e-06, "objective/entropy": -48.67333221435547, "objective/kl": 22.937318801879883, "objective/non_score_reward": -2.293731927871704, "objective/rlhf_reward": -7.227516422944005, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 2.106391191482544, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6140183210372925, "step": 427, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0030503273010254 }, { "episode": 6864, "epoch": 0.12337779055972967, "loss/policy_avg": 0.6610305309295654, "lr": 9.726482617586912e-06, "objective/entropy": 80.99835968017578, "objective/kl": 39.61425018310547, "objective/non_score_reward": -3.961425304412842, "objective/rlhf_reward": -14.520187649756593, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 4.697940349578857, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5119843482971191, "step": 428, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.000978946685791 }, { "episode": 6880, "epoch": 0.1236653844771183, "loss/policy_avg": 0.11895343661308289, "lr": 9.72584355828221e-06, "objective/entropy": 162.822021484375, "objective/kl": 44.34868621826172, "objective/non_score_reward": -4.434868812561035, "objective/rlhf_reward": -16.339474773406984, "objective/scores": 0.35, "policy/approxkl_avg": 4.151267051696777, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5969315767288208, "step": 429, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9984312057495117 }, { "episode": 6896, "epoch": 0.12395297839450696, "loss/policy_avg": 0.5579686164855957, "lr": 9.725204498977506e-06, "objective/entropy": -17.16387367248535, "objective/kl": 37.852745056152344, "objective/non_score_reward": -3.7852747440338135, "objective/rlhf_reward": -10.741099214553834, "objective/scores": 1.1, "policy/approxkl_avg": 41.654693603515625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4921834468841553, "step": 430, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9981064796447754 }, { "episode": 6912, "epoch": 0.1242405723118956, "loss/policy_avg": 0.15593896806240082, "lr": 9.724565439672803e-06, "objective/entropy": 149.734130859375, "objective/kl": 25.60231590270996, "objective/non_score_reward": -2.5602316856384277, "objective/rlhf_reward": -8.416097994121621, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 30.770790100097656, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4205164313316345, "step": 431, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.001136541366577 }, { "episode": 6928, "epoch": 0.12452816622928425, "loss/policy_avg": 0.7011826038360596, "lr": 9.7239263803681e-06, "objective/entropy": 55.692283630371094, "objective/kl": 43.931175231933594, "objective/non_score_reward": -4.393117904663086, "objective/rlhf_reward": -15.83913828531901, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 20.95511245727539, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.3865165710449219, "step": 432, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9988900423049927 }, { "episode": 6944, "epoch": 0.1248157601466729, "loss/policy_avg": 1.022209882736206, "lr": 9.723287321063397e-06, "objective/entropy": 83.84861755371094, "objective/kl": 42.09056854248047, "objective/non_score_reward": -4.209057331085205, "objective/rlhf_reward": -15.011400456699441, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 58.270423889160156, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6480612754821777, "step": 433, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9980558156967163 }, { "episode": 6960, "epoch": 0.12510335406406153, "loss/policy_avg": 0.5657510757446289, "lr": 9.722648261758692e-06, "objective/entropy": 115.53985595703125, "objective/kl": 33.222572326660156, "objective/non_score_reward": -3.3222572803497314, "objective/rlhf_reward": -11.684908661905844, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 22.127004623413086, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8776997923851013, "step": 434, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9980456829071045 }, { "episode": 6976, "epoch": 0.1253909479814502, "loss/policy_avg": 0.861635684967041, "lr": 9.722009202453989e-06, "objective/entropy": 191.2237548828125, "objective/kl": 33.726585388183594, "objective/non_score_reward": -3.3726587295532227, "objective/rlhf_reward": -11.757301584879556, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 66.25660705566406, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5842768549919128, "step": 435, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0004849433898926 }, { "episode": 6992, "epoch": 0.12567854189883884, "loss/policy_avg": 0.30258873105049133, "lr": 9.721370143149284e-06, "objective/entropy": 179.46835327148438, "objective/kl": 39.91570281982422, "objective/non_score_reward": -3.9915707111358643, "objective/rlhf_reward": -14.362163100306113, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 8.522405624389648, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.45521217584609985, "step": 436, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9980738162994385 }, { "episode": 7008, "epoch": 0.12596613581622748, "loss/policy_avg": 0.9346391558647156, "lr": 9.720731083844581e-06, "objective/entropy": 37.353126525878906, "objective/kl": 43.99368667602539, "objective/non_score_reward": -4.3993682861328125, "objective/rlhf_reward": -16.11652124207771, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 6.334951877593994, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7622473835945129, "step": 437, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0002927780151367 }, { "episode": 7024, "epoch": 0.12625372973361612, "loss/policy_avg": 1.3644543886184692, "lr": 9.720092024539878e-06, "objective/entropy": -80.11536407470703, "objective/kl": 26.775297164916992, "objective/non_score_reward": -2.677529811859131, "objective/rlhf_reward": -9.33151695975433, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 41.969295501708984, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.37151363492012024, "step": 438, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9997823238372803 }, { "episode": 7040, "epoch": 0.1265413236510048, "loss/policy_avg": -0.03804589435458183, "lr": 9.719452965235175e-06, "objective/entropy": -68.57923889160156, "objective/kl": 41.42705535888672, "objective/non_score_reward": -4.14270544052124, "objective/rlhf_reward": -15.170821285247804, "objective/scores": 0.35, "policy/approxkl_avg": 0.610215425491333, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.3907659649848938, "step": 439, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.002786636352539 }, { "episode": 7056, "epoch": 0.12682891756839343, "loss/policy_avg": 0.9513897895812988, "lr": 9.718813905930472e-06, "objective/entropy": 176.7696533203125, "objective/kl": 32.5645751953125, "objective/non_score_reward": -3.2564573287963867, "objective/rlhf_reward": -11.666579568122309, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 84.38311004638672, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.43554389476776123, "step": 440, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9961769580841064 }, { "episode": 7072, "epoch": 0.12711651148578207, "loss/policy_avg": 1.6144888401031494, "lr": 9.718174846625767e-06, "objective/entropy": -14.703704833984375, "objective/kl": 21.40297508239746, "objective/non_score_reward": -2.1402974128723145, "objective/rlhf_reward": -6.1611901283264165, "objective/scores": 0.6, "policy/approxkl_avg": 90.28463745117188, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6536741852760315, "step": 441, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9973065853118896 }, { "episode": 7088, "epoch": 0.12740410540317074, "loss/policy_avg": -0.564086377620697, "lr": 9.717535787321064e-06, "objective/entropy": -92.54092407226562, "objective/kl": 27.47213363647461, "objective/non_score_reward": -2.747213363647461, "objective/rlhf_reward": -9.432593672481135, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 24.76102066040039, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.4654249846935272, "step": 442, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.999948263168335 }, { "episode": 7104, "epoch": 0.12769169932055938, "loss/policy_avg": 0.7788177728652954, "lr": 9.71689672801636e-06, "objective/entropy": -28.373756408691406, "objective/kl": 35.91747283935547, "objective/non_score_reward": -3.591747283935547, "objective/rlhf_reward": -13.025353243857055, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 27.097518920898438, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.38184916973114014, "step": 443, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.997880220413208 }, { "episode": 7120, "epoch": 0.12797929323794802, "loss/policy_avg": 0.3843851685523987, "lr": 9.716257668711657e-06, "objective/entropy": 140.32058715820312, "objective/kl": 37.66426467895508, "objective/non_score_reward": -3.7664265632629395, "objective/rlhf_reward": -13.332372681299844, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 19.241600036621094, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.349905788898468, "step": 444, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.0018460750579834 }, { "episode": 7136, "epoch": 0.12826688715533666, "loss/policy_avg": 0.540947437286377, "lr": 9.715618609406954e-06, "objective/entropy": 148.06629943847656, "objective/kl": 42.55817413330078, "objective/non_score_reward": -4.25581693649292, "objective/rlhf_reward": -15.66401835653631, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 75.46281433105469, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.541397213935852, "step": 445, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9983032941818237 }, { "episode": 7152, "epoch": 0.12855448107272532, "loss/policy_avg": 1.184004306793213, "lr": 9.714979550102251e-06, "objective/entropy": 84.38250732421875, "objective/kl": 33.90479278564453, "objective/non_score_reward": -3.390479564666748, "objective/rlhf_reward": -12.220281770735411, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 51.5472412109375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5456966161727905, "step": 446, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9997706413269043 }, { "episode": 7168, "epoch": 0.12884207499011396, "loss/policy_avg": 1.1816997528076172, "lr": 9.714340490797546e-06, "objective/entropy": -57.552371978759766, "objective/kl": 30.747276306152344, "objective/non_score_reward": -3.074728012084961, "objective/rlhf_reward": -7.898911571502686, "objective/scores": 1.1, "policy/approxkl_avg": 35.125282287597656, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.30223560333251953, "step": 447, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9982578754425049 }, { "episode": 7184, "epoch": 0.1291296689075026, "loss/policy_avg": 0.3517414927482605, "lr": 9.713701431492843e-06, "objective/entropy": 172.75254821777344, "objective/kl": 32.79669189453125, "objective/non_score_reward": -3.2796695232391357, "objective/rlhf_reward": -11.456818585813629, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 27.51565170288086, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4375608265399933, "step": 448, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9987891912460327 }, { "episode": 7200, "epoch": 0.12941726282489124, "loss/policy_avg": 0.3732157051563263, "lr": 9.71306237218814e-06, "objective/entropy": 71.94863891601562, "objective/kl": 37.338172912597656, "objective/non_score_reward": -3.7338175773620605, "objective/rlhf_reward": -12.535269832611085, "objective/scores": 0.6, "policy/approxkl_avg": 85.17916870117188, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.43261417746543884, "step": 449, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.99893057346344 }, { "episode": 7216, "epoch": 0.1297048567422799, "loss/policy_avg": 1.4543174505233765, "lr": 9.712423312883437e-06, "objective/entropy": 258.2192687988281, "objective/kl": 49.02899169921875, "objective/non_score_reward": -4.902898788452148, "objective/rlhf_reward": -18.286083135634584, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 21.199018478393555, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6569823026657104, "step": 450, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.996167778968811 }, { "episode": 7232, "epoch": 0.12999245065966855, "loss/policy_avg": 0.49340057373046875, "lr": 9.711784253578734e-06, "objective/entropy": -43.6832160949707, "objective/kl": 33.31085968017578, "objective/non_score_reward": -3.3310861587524414, "objective/rlhf_reward": -11.87374721011673, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 28.588882446289062, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6205878257751465, "step": 451, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.999979019165039 }, { "episode": 7248, "epoch": 0.1302800445770572, "loss/policy_avg": 0.6000991463661194, "lr": 9.711145194274029e-06, "objective/entropy": 90.76286315917969, "objective/kl": 45.31011962890625, "objective/non_score_reward": -4.531011581420898, "objective/rlhf_reward": -16.745444872466425, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 113.64555358886719, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5617672204971313, "step": 452, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9989123344421387 }, { "episode": 7264, "epoch": 0.13056763849444583, "loss/policy_avg": 1.042801022529602, "lr": 9.710506134969326e-06, "objective/entropy": 189.15316772460938, "objective/kl": 36.73876953125, "objective/non_score_reward": -3.673877477645874, "objective/rlhf_reward": -13.295509910583498, "objective/scores": 0.35, "policy/approxkl_avg": 21.024385452270508, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4428805708885193, "step": 453, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9950382709503174 }, { "episode": 7280, "epoch": 0.1308552324118345, "loss/policy_avg": 0.6862419843673706, "lr": 9.709867075664623e-06, "objective/entropy": -38.022178649902344, "objective/kl": 48.085838317871094, "objective/non_score_reward": -4.808583736419678, "objective/rlhf_reward": -17.71856340149277, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 89.67850494384766, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5650781393051147, "step": 454, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9975779056549072 }, { "episode": 7296, "epoch": 0.13114282632922314, "loss/policy_avg": -0.21822161972522736, "lr": 9.70922801635992e-06, "objective/entropy": 24.07161521911621, "objective/kl": 39.552284240722656, "objective/non_score_reward": -3.955228328704834, "objective/rlhf_reward": -14.339960935528637, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 4.527206897735596, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.32756006717681885, "step": 455, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0025227069854736 }, { "episode": 7312, "epoch": 0.13143042024661178, "loss/policy_avg": 0.050335630774497986, "lr": 9.708588957055215e-06, "objective/entropy": 47.709957122802734, "objective/kl": 34.94654083251953, "objective/non_score_reward": -3.4946541786193848, "objective/rlhf_reward": -11.054897938610289, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 20.525684356689453, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.40393322706222534, "step": 456, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.999319314956665 }, { "episode": 7328, "epoch": 0.13171801416400042, "loss/policy_avg": -0.2107769250869751, "lr": 9.707949897750512e-06, "objective/entropy": 4.4464111328125, "objective/kl": 30.348583221435547, "objective/non_score_reward": -3.034858226776123, "objective/rlhf_reward": -10.760830977050167, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 9.349994659423828, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4865412712097168, "step": 457, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.025125503540039 }, { "episode": 7344, "epoch": 0.1320056080813891, "loss/policy_avg": 0.9970263242721558, "lr": 9.707310838445809e-06, "objective/entropy": 197.11566162109375, "objective/kl": 38.80963897705078, "objective/non_score_reward": -3.8809640407562256, "objective/rlhf_reward": -14.008084261211092, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 64.41117858886719, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.47768014669418335, "step": 458, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9979461431503296 }, { "episode": 7360, "epoch": 0.13229320199877773, "loss/policy_avg": 0.19499164819717407, "lr": 9.706671779141105e-06, "objective/entropy": 109.55068969726562, "objective/kl": 34.07399368286133, "objective/non_score_reward": -3.4073991775512695, "objective/rlhf_reward": -13.629597425460815, "objective/scores": 0.0, "policy/approxkl_avg": 8.495365142822266, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.565830647945404, "step": 459, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.999122977256775 }, { "episode": 7376, "epoch": 0.13258079591616637, "loss/policy_avg": 0.5532440543174744, "lr": 9.7060327198364e-06, "objective/entropy": 83.35699462890625, "objective/kl": 32.4083251953125, "objective/non_score_reward": -3.240832567214966, "objective/rlhf_reward": -11.359210524622519, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 5.45071268081665, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4963003993034363, "step": 460, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.011878490447998 }, { "episode": 7392, "epoch": 0.13286838983355503, "loss/policy_avg": 0.552447497844696, "lr": 9.705393660531698e-06, "objective/entropy": 160.72750854492188, "objective/kl": 47.16038131713867, "objective/non_score_reward": -4.716038227081299, "objective/rlhf_reward": -17.440320809085932, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 79.7755126953125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.3849433958530426, "step": 461, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9991737604141235 }, { "episode": 7408, "epoch": 0.13315598375094367, "loss/policy_avg": 1.8305895328521729, "lr": 9.704754601226994e-06, "objective/entropy": 148.47381591796875, "objective/kl": 32.803104400634766, "objective/non_score_reward": -3.280310869216919, "objective/rlhf_reward": -11.779607584982543, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 4.440328121185303, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.34545716643333435, "step": 462, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.0012433528900146 }, { "episode": 7424, "epoch": 0.1334435776683323, "loss/policy_avg": -0.21606217324733734, "lr": 9.704115541922291e-06, "objective/entropy": -211.98297119140625, "objective/kl": 22.90569305419922, "objective/non_score_reward": -2.2905690670013428, "objective/rlhf_reward": -7.337447400363992, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 3.513453483581543, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.6393148899078369, "step": 463, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0019214153289795 }, { "episode": 7440, "epoch": 0.13373117158572095, "loss/policy_avg": 1.7707817554473877, "lr": 9.703476482617588e-06, "objective/entropy": -42.62212371826172, "objective/kl": 38.86042022705078, "objective/non_score_reward": -3.8860418796539307, "objective/rlhf_reward": -13.987908928599907, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 6.585241317749023, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5668050050735474, "step": 464, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9999806880950928 }, { "episode": 7456, "epoch": 0.13401876550310962, "loss/policy_avg": 0.5606961250305176, "lr": 9.702837423312883e-06, "objective/entropy": 46.48912811279297, "objective/kl": 41.47301483154297, "objective/non_score_reward": -4.14730167388916, "objective/rlhf_reward": -14.18920729160309, "objective/scores": 0.6, "policy/approxkl_avg": 15.323009490966797, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4189653694629669, "step": 465, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.002035617828369 }, { "episode": 7472, "epoch": 0.13430635942049826, "loss/policy_avg": 0.3116866946220398, "lr": 9.70219836400818e-06, "objective/entropy": 202.82122802734375, "objective/kl": 30.228025436401367, "objective/non_score_reward": -3.0228028297424316, "objective/rlhf_reward": -10.266382093700479, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 3.352916955947876, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5167281627655029, "step": 466, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.0003514289855957 }, { "episode": 7488, "epoch": 0.1345939533378869, "loss/policy_avg": 0.9980499148368835, "lr": 9.701559304703477e-06, "objective/entropy": 127.01738739013672, "objective/kl": 39.83085632324219, "objective/non_score_reward": -3.9830856323242188, "objective/rlhf_reward": -14.45138919633186, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 25.166885375976562, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.736939549446106, "step": 467, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9994404315948486 }, { "episode": 7504, "epoch": 0.13488154725527554, "loss/policy_avg": 0.21544580161571503, "lr": 9.700920245398774e-06, "objective/entropy": 233.09375, "objective/kl": 32.72058868408203, "objective/non_score_reward": -3.272059202194214, "objective/rlhf_reward": -11.72898670408575, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 78.07327270507812, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.62492835521698, "step": 468, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9992430210113525 }, { "episode": 7520, "epoch": 0.1351691411726642, "loss/policy_avg": 0.4316645860671997, "lr": 9.700281186094071e-06, "objective/entropy": -37.32112121582031, "objective/kl": 29.643779754638672, "objective/non_score_reward": -2.9643778800964355, "objective/rlhf_reward": -10.406913261027679, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 80.32553100585938, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8007365465164185, "step": 469, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9981448650360107 }, { "episode": 7536, "epoch": 0.13545673509005285, "loss/policy_avg": -0.22065140306949615, "lr": 9.699642126789368e-06, "objective/entropy": -288.51220703125, "objective/kl": 31.09638023376465, "objective/non_score_reward": -3.109638214111328, "objective/rlhf_reward": -10.882293074336602, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 15.306625366210938, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6064466238021851, "step": 470, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000288248062134 }, { "episode": 7552, "epoch": 0.1357443290074415, "loss/policy_avg": 0.7062017917633057, "lr": 9.699003067484663e-06, "objective/entropy": -185.96678161621094, "objective/kl": 38.07769012451172, "objective/non_score_reward": -3.8077688217163086, "objective/rlhf_reward": -12.307355795742247, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 21.195262908935547, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6327893137931824, "step": 471, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9975873231887817 }, { "episode": 7568, "epoch": 0.13603192292483013, "loss/policy_avg": 0.21052365005016327, "lr": 9.69836400817996e-06, "objective/entropy": -114.1561050415039, "objective/kl": 38.60865020751953, "objective/non_score_reward": -3.8608651161193848, "objective/rlhf_reward": -15.443459749221802, "objective/scores": 0.0, "policy/approxkl_avg": 7.808056831359863, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.587588906288147, "step": 472, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.999895453453064 }, { "episode": 7584, "epoch": 0.1363195168422188, "loss/policy_avg": 0.9177588224411011, "lr": 9.697724948875257e-06, "objective/entropy": 91.9778823852539, "objective/kl": 49.228004455566406, "objective/non_score_reward": -4.9228010177612305, "objective/rlhf_reward": -18.34956817915979, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 252.69491577148438, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5273959636688232, "step": 473, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.998390555381775 }, { "episode": 7600, "epoch": 0.13660711075960744, "loss/policy_avg": 0.2135259062051773, "lr": 9.697085889570554e-06, "objective/entropy": -91.76605224609375, "objective/kl": 20.413612365722656, "objective/non_score_reward": -2.0413613319396973, "objective/rlhf_reward": -6.741612751682368, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 1.6130738258361816, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4268151521682739, "step": 474, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9995267391204834 }, { "episode": 7616, "epoch": 0.13689470467699608, "loss/policy_avg": 0.7761150598526001, "lr": 9.69644683026585e-06, "objective/entropy": 25.679851531982422, "objective/kl": 40.76634979248047, "objective/non_score_reward": -4.076634883880615, "objective/rlhf_reward": -14.927937605468134, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 14.822543144226074, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7041196823120117, "step": 475, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9984660148620605 }, { "episode": 7632, "epoch": 0.13718229859438472, "loss/policy_avg": 1.497192144393921, "lr": 9.695807770961146e-06, "objective/entropy": -73.21554565429688, "objective/kl": 31.698223114013672, "objective/non_score_reward": -3.1698226928710938, "objective/rlhf_reward": -10.731878827290473, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 4.129430770874023, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6742293834686279, "step": 476, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.998528242111206 }, { "episode": 7648, "epoch": 0.13746989251177338, "loss/policy_avg": 0.7623737454414368, "lr": 9.695168711656443e-06, "objective/entropy": -212.29415893554688, "objective/kl": 26.89659881591797, "objective/non_score_reward": -2.689659833908081, "objective/rlhf_reward": -9.433126482993288, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 40.945072174072266, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6739322543144226, "step": 477, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.999362587928772 }, { "episode": 7664, "epoch": 0.13775748642916202, "loss/policy_avg": 0.8399478793144226, "lr": 9.694529652351738e-06, "objective/entropy": -28.784271240234375, "objective/kl": 46.6888542175293, "objective/non_score_reward": -4.668885707855225, "objective/rlhf_reward": -17.31629320356695, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 33.967262268066406, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7294750213623047, "step": 478, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.998927116394043 }, { "episode": 7680, "epoch": 0.13804508034655066, "loss/policy_avg": 0.48977339267730713, "lr": 9.693890593047035e-06, "objective/entropy": -103.34806823730469, "objective/kl": 35.017757415771484, "objective/non_score_reward": -3.5017752647399902, "objective/rlhf_reward": -12.583269674976435, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 104.5495376586914, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6413424015045166, "step": 479, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9937057495117188 }, { "episode": 7696, "epoch": 0.13833267426393933, "loss/policy_avg": 0.20745977759361267, "lr": 9.693251533742331e-06, "objective/entropy": 57.130958557128906, "objective/kl": 41.31460189819336, "objective/non_score_reward": -4.131460189819336, "objective/rlhf_reward": -15.010069215091402, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 30.427654266357422, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5963334441184998, "step": 480, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0048885345458984 }, { "episode": 7712, "epoch": 0.13862026818132797, "loss/policy_avg": 0.15290355682373047, "lr": 9.692612474437628e-06, "objective/entropy": 106.57427978515625, "objective/kl": 41.791648864746094, "objective/non_score_reward": -4.179165363311768, "objective/rlhf_reward": -13.792941962124083, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 30.64380645751953, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.521045446395874, "step": 481, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.998927354812622 }, { "episode": 7728, "epoch": 0.1389078620987166, "loss/policy_avg": 0.7474868893623352, "lr": 9.691973415132925e-06, "objective/entropy": 228.7914581298828, "objective/kl": 35.901405334472656, "objective/non_score_reward": -3.5901405811309814, "objective/rlhf_reward": -11.960562801361085, "objective/scores": 0.6, "policy/approxkl_avg": 88.05641174316406, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5807632207870483, "step": 482, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9998561143875122 }, { "episode": 7744, "epoch": 0.13919545601610525, "loss/policy_avg": 1.6704270839691162, "lr": 9.691334355828222e-06, "objective/entropy": -124.35450744628906, "objective/kl": 36.7768440246582, "objective/non_score_reward": -3.6776845455169678, "objective/rlhf_reward": -13.286906321247187, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 20.388381958007812, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6395858526229858, "step": 483, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9977163076400757 }, { "episode": 7760, "epoch": 0.13948304993349392, "loss/policy_avg": 0.41683727502822876, "lr": 9.690695296523517e-06, "objective/entropy": 82.72738647460938, "objective/kl": 38.18916702270508, "objective/non_score_reward": -3.8189167976379395, "objective/rlhf_reward": -13.851835568149653, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 7.188452243804932, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5991804599761963, "step": 484, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0014636516571045 }, { "episode": 7776, "epoch": 0.13977064385088256, "loss/policy_avg": 0.9677872657775879, "lr": 9.690056237218814e-06, "objective/entropy": 33.29289627075195, "objective/kl": 28.069137573242188, "objective/non_score_reward": -2.8069138526916504, "objective/rlhf_reward": -9.868405782912655, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 2.909322738647461, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.3673190474510193, "step": 485, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9988304376602173 }, { "episode": 7792, "epoch": 0.1400582377682712, "loss/policy_avg": 1.4412565231323242, "lr": 9.689417177914111e-06, "objective/entropy": 143.29071044921875, "objective/kl": 37.918006896972656, "objective/non_score_reward": -3.7918009757995605, "objective/rlhf_reward": -13.71660516700302, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 25.657358169555664, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4064646065235138, "step": 486, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9968105554580688 }, { "episode": 7808, "epoch": 0.14034583168565984, "loss/policy_avg": 0.5567857027053833, "lr": 9.688778118609408e-06, "objective/entropy": 108.03604125976562, "objective/kl": 25.368505477905273, "objective/non_score_reward": -2.5368504524230957, "objective/rlhf_reward": -8.485543017805206, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 7.421592712402344, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5672407746315002, "step": 487, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.002861499786377 }, { "episode": 7824, "epoch": 0.1406334256030485, "loss/policy_avg": 0.1908845454454422, "lr": 9.688139059304705e-06, "objective/entropy": 155.91831970214844, "objective/kl": 27.6815128326416, "objective/non_score_reward": -2.7681517601013184, "objective/rlhf_reward": -6.672606325149536, "objective/scores": 1.1, "policy/approxkl_avg": 10.237553596496582, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4108530282974243, "step": 488, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9994752407073975 }, { "episode": 7840, "epoch": 0.14092101952043715, "loss/policy_avg": 0.9416247606277466, "lr": 9.6875e-06, "objective/entropy": -103.95333862304688, "objective/kl": 41.44330978393555, "objective/non_score_reward": -4.144330978393555, "objective/rlhf_reward": -14.177324867248537, "objective/scores": 0.6, "policy/approxkl_avg": 14.313966751098633, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.464949369430542, "step": 489, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9988256692886353 }, { "episode": 7856, "epoch": 0.1412086134378258, "loss/policy_avg": 2.2338528633117676, "lr": 9.686860940695297e-06, "objective/entropy": 47.52754211425781, "objective/kl": 42.061561584472656, "objective/non_score_reward": -4.206155776977539, "objective/rlhf_reward": -15.49911001685254, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 55.551963806152344, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7129836082458496, "step": 490, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9957895278930664 }, { "episode": 7872, "epoch": 0.14149620735521443, "loss/policy_avg": 0.20792043209075928, "lr": 9.686221881390594e-06, "objective/entropy": -1.685638427734375, "objective/kl": 38.568145751953125, "objective/non_score_reward": -3.8568148612976074, "objective/rlhf_reward": -14.10174730780713, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 4.467138290405273, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4478898048400879, "step": 491, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0003159046173096 }, { "episode": 7888, "epoch": 0.1417838012726031, "loss/policy_avg": -0.18662777543067932, "lr": 9.68558282208589e-06, "objective/entropy": -26.272117614746094, "objective/kl": 42.39691925048828, "objective/non_score_reward": -4.239691734313965, "objective/rlhf_reward": -15.633254084616823, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 55.918922424316406, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5446948409080505, "step": 492, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.999772548675537 }, { "episode": 7904, "epoch": 0.14207139518999173, "loss/policy_avg": 0.31151118874549866, "lr": 9.684943762781188e-06, "objective/entropy": 33.51483154296875, "objective/kl": 41.72319030761719, "objective/non_score_reward": -4.172318935394287, "objective/rlhf_reward": -15.265443642337885, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 126.68960571289062, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.847740650177002, "step": 493, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9983421564102173 }, { "episode": 7920, "epoch": 0.14235898910738037, "loss/policy_avg": 0.7131699323654175, "lr": 9.684304703476484e-06, "objective/entropy": -26.66382598876953, "objective/kl": 45.098487854003906, "objective/non_score_reward": -4.5098490715026855, "objective/rlhf_reward": -16.58879766902481, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 113.07894897460938, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4858088493347168, "step": 494, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9995558261871338 }, { "episode": 7936, "epoch": 0.142646583024769, "loss/policy_avg": 0.7710833549499512, "lr": 9.68366564417178e-06, "objective/entropy": 83.98237609863281, "objective/kl": 33.111812591552734, "objective/non_score_reward": -3.3111815452575684, "objective/rlhf_reward": -11.763773563320994, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 51.01200866699219, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5449756979942322, "step": 495, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9964442253112793 }, { "episode": 7952, "epoch": 0.14293417694215768, "loss/policy_avg": 0.6315375566482544, "lr": 9.683026584867076e-06, "objective/entropy": -325.8221435546875, "objective/kl": 24.298229217529297, "objective/non_score_reward": -2.4298229217529297, "objective/rlhf_reward": -8.360041939948482, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 1.398147702217102, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.5080133676528931, "step": 496, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.000394344329834 }, { "episode": 7968, "epoch": 0.14322177085954632, "loss/policy_avg": 0.2566729485988617, "lr": 9.682387525562373e-06, "objective/entropy": -97.19512939453125, "objective/kl": 27.388530731201172, "objective/non_score_reward": -2.7388532161712646, "objective/rlhf_reward": -9.596163117621822, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 83.0306167602539, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6368144154548645, "step": 497, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9983546733856201 }, { "episode": 7984, "epoch": 0.14350936477693496, "loss/policy_avg": 2.3038244247436523, "lr": 9.68174846625767e-06, "objective/entropy": -312.97418212890625, "objective/kl": 39.9110221862793, "objective/non_score_reward": -3.9911022186279297, "objective/rlhf_reward": -15.96440851688385, "objective/scores": 0.0, "policy/approxkl_avg": 229.93643188476562, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.5881428718566895, "step": 498, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9984025955200195 }, { "episode": 8000, "epoch": 0.14379695869432363, "loss/policy_avg": 0.4790765643119812, "lr": 9.681109406952967e-06, "objective/entropy": 102.07373809814453, "objective/kl": 39.629451751708984, "objective/non_score_reward": -3.9629452228546143, "objective/rlhf_reward": -14.189921622694122, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 38.121917724609375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5411556959152222, "step": 499, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9958930015563965 }, { "episode": 8016, "epoch": 0.14408455261171227, "loss/policy_avg": 1.2157926559448242, "lr": 9.680470347648262e-06, "objective/entropy": 91.86766052246094, "objective/kl": 41.75320053100586, "objective/non_score_reward": -4.175320148468018, "objective/rlhf_reward": -16.701281309127808, "objective/scores": 0.0, "policy/approxkl_avg": 203.22079467773438, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7408411502838135, "step": 500, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9977760314941406 }, { "episode": 8032, "epoch": 0.1443721465291009, "loss/policy_avg": 0.07564640045166016, "lr": 9.67983128834356e-06, "objective/entropy": -72.9471664428711, "objective/kl": 29.065643310546875, "objective/non_score_reward": -2.906564474105835, "objective/rlhf_reward": -10.247655727950434, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 7.510244369506836, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.49747195839881897, "step": 501, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.999770164489746 }, { "episode": 8048, "epoch": 0.14465974044648955, "loss/policy_avg": 0.7079442739486694, "lr": 9.679192229038854e-06, "objective/entropy": 44.22209930419922, "objective/kl": 47.81110382080078, "objective/non_score_reward": -4.781109809875488, "objective/rlhf_reward": -17.462581282079803, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 23.670162200927734, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.42216792702674866, "step": 502, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.999302864074707 }, { "episode": 8064, "epoch": 0.14494733436387822, "loss/policy_avg": 0.17914190888404846, "lr": 9.678553169734151e-06, "objective/entropy": -84.96662902832031, "objective/kl": 34.51586151123047, "objective/non_score_reward": -3.4515867233276367, "objective/rlhf_reward": -12.427744128791193, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 106.49320983886719, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8383775353431702, "step": 503, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.998382806777954 }, { "episode": 8080, "epoch": 0.14523492828126686, "loss/policy_avg": 1.3371942043304443, "lr": 9.677914110429448e-06, "objective/entropy": 2.8826751708984375, "objective/kl": 46.65193557739258, "objective/non_score_reward": -4.665193557739258, "objective/rlhf_reward": -17.05665400988253, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 12.464231491088867, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7261908054351807, "step": 504, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9986940622329712 }, { "episode": 8096, "epoch": 0.1455225221986555, "loss/policy_avg": 0.5748533010482788, "lr": 9.677275051124745e-06, "objective/entropy": 66.49700927734375, "objective/kl": 40.18706130981445, "objective/non_score_reward": -4.018706321716309, "objective/rlhf_reward": -14.412966256559478, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 14.041053771972656, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7998212575912476, "step": 505, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9987947940826416 }, { "episode": 8112, "epoch": 0.14581011611604414, "loss/policy_avg": 0.04638584703207016, "lr": 9.676635991820042e-06, "objective/entropy": -63.98976516723633, "objective/kl": 29.01580810546875, "objective/non_score_reward": -2.901580810546875, "objective/rlhf_reward": -10.050064413753107, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 3.6628360748291016, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5268914103507996, "step": 506, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.000178098678589 }, { "episode": 8128, "epoch": 0.1460977100334328, "loss/policy_avg": 1.0773940086364746, "lr": 9.675996932515339e-06, "objective/entropy": -80.96644592285156, "objective/kl": 32.92605972290039, "objective/non_score_reward": -3.2926058769226074, "objective/rlhf_reward": -11.811174714301508, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 15.708122253417969, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.707777738571167, "step": 507, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9975059032440186 }, { "episode": 8144, "epoch": 0.14638530395082144, "loss/policy_avg": 0.8820767998695374, "lr": 9.675357873210634e-06, "objective/entropy": 20.50244140625, "objective/kl": 37.39576721191406, "objective/non_score_reward": -3.739576578140259, "objective/rlhf_reward": -13.01089508362287, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 92.53317260742188, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7722728252410889, "step": 508, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9979910850524902 }, { "episode": 8160, "epoch": 0.14667289786821008, "loss/policy_avg": 0.33342352509498596, "lr": 9.67471881390593e-06, "objective/entropy": -130.01919555664062, "objective/kl": 37.01002502441406, "objective/non_score_reward": -3.7010021209716797, "objective/rlhf_reward": -13.142149215162384, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 111.35385131835938, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7142958641052246, "step": 509, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9980767965316772 }, { "episode": 8176, "epoch": 0.14696049178559872, "loss/policy_avg": 2.0854592323303223, "lr": 9.674079754601228e-06, "objective/entropy": -153.78465270996094, "objective/kl": 37.76445770263672, "objective/non_score_reward": -3.7764456272125244, "objective/rlhf_reward": -15.105782985687256, "objective/scores": 0.0, "policy/approxkl_avg": 13.81374740600586, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4845733046531677, "step": 510, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.999340295791626 }, { "episode": 8192, "epoch": 0.1472480857029874, "loss/policy_avg": 0.4099072813987732, "lr": 9.673440695296525e-06, "objective/entropy": -178.3352508544922, "objective/kl": 27.485443115234375, "objective/non_score_reward": -2.748544216156006, "objective/rlhf_reward": -10.994177103042603, "objective/scores": 0.0, "policy/approxkl_avg": 12.48928451538086, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6313323974609375, "step": 511, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9982385635375977 }, { "episode": 8208, "epoch": 0.14753567962037603, "loss/policy_avg": -0.27953195571899414, "lr": 9.672801635991821e-06, "objective/entropy": -270.947509765625, "objective/kl": 38.093963623046875, "objective/non_score_reward": -3.809396743774414, "objective/rlhf_reward": -13.633467230860312, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 1.603658676147461, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6720960736274719, "step": 512, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.000967502593994 }, { "episode": 8224, "epoch": 0.14782327353776467, "loss/policy_avg": 1.2482883930206299, "lr": 9.672162576687117e-06, "objective/entropy": 129.20474243164062, "objective/kl": 45.26194763183594, "objective/non_score_reward": -4.526195049285889, "objective/rlhf_reward": -16.371446386973062, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 25.947460174560547, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5968309044837952, "step": 513, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9988489151000977 }, { "episode": 8240, "epoch": 0.1481108674551533, "loss/policy_avg": -0.1279393583536148, "lr": 9.671523517382413e-06, "objective/entropy": -51.24613952636719, "objective/kl": 45.18963623046875, "objective/non_score_reward": -4.518963813781738, "objective/rlhf_reward": -16.56008394935959, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 2.309969902038574, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.7829691171646118, "step": 514, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.001248598098755 }, { "episode": 8256, "epoch": 0.14839846137254198, "loss/policy_avg": 1.3355847597122192, "lr": 9.67088445807771e-06, "objective/entropy": -83.05213165283203, "objective/kl": 52.17377853393555, "objective/non_score_reward": -5.21737813949585, "objective/rlhf_reward": -17.94579282844183, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 16.227012634277344, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.46618372201919556, "step": 515, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9993078708648682 }, { "episode": 8272, "epoch": 0.14868605528993062, "loss/policy_avg": -0.5786250829696655, "lr": 9.670245398773007e-06, "objective/entropy": 113.66342163085938, "objective/kl": 33.45789337158203, "objective/non_score_reward": -3.3457894325256348, "objective/rlhf_reward": -10.459439073444578, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 25.492090225219727, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 1.0281870365142822, "step": 516, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.002322196960449 }, { "episode": 8288, "epoch": 0.14897364920731926, "loss/policy_avg": 1.3281817436218262, "lr": 9.669606339468304e-06, "objective/entropy": -242.1662139892578, "objective/kl": 37.54686737060547, "objective/non_score_reward": -3.7546873092651367, "objective/rlhf_reward": -12.896042885557684, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 13.032073020935059, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6886230707168579, "step": 517, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9998258352279663 }, { "episode": 8304, "epoch": 0.1492612431247079, "loss/policy_avg": 0.027422528713941574, "lr": 9.668967280163601e-06, "objective/entropy": -86.30763244628906, "objective/kl": 30.702964782714844, "objective/non_score_reward": -3.0702965259552, "objective/rlhf_reward": -10.76541432121628, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 38.08885955810547, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6287646293640137, "step": 518, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0001487731933594 }, { "episode": 8320, "epoch": 0.14954883704209657, "loss/policy_avg": 0.026706572622060776, "lr": 9.668328220858896e-06, "objective/entropy": -210.8118438720703, "objective/kl": 35.477561950683594, "objective/non_score_reward": -3.5477566719055176, "objective/rlhf_reward": -12.831776821349544, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 16.58705711364746, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8783824443817139, "step": 519, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.997406244277954 }, { "episode": 8336, "epoch": 0.1498364309594852, "loss/policy_avg": -0.08051559329032898, "lr": 9.667689161554193e-06, "objective/entropy": -74.4656982421875, "objective/kl": 35.04960250854492, "objective/non_score_reward": -3.504960298538208, "objective/rlhf_reward": -12.41572085386904, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 63.21892547607422, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5668095350265503, "step": 520, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.998744010925293 }, { "episode": 8352, "epoch": 0.15012402487687385, "loss/policy_avg": 3.5908355712890625, "lr": 9.66705010224949e-06, "objective/entropy": -9.926849365234375, "objective/kl": 33.93809509277344, "objective/non_score_reward": -3.3938088417053223, "objective/rlhf_reward": -12.175236260890962, "objective/scores": 0.35, "policy/approxkl_avg": 7.082281112670898, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.665743350982666, "step": 521, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0017683506011963 }, { "episode": 8368, "epoch": 0.1504116187942625, "loss/policy_avg": -0.30025702714920044, "lr": 9.666411042944787e-06, "objective/entropy": -11.379417419433594, "objective/kl": 32.27046585083008, "objective/non_score_reward": -3.227046489715576, "objective/rlhf_reward": -11.392414653094944, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 32.16754913330078, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.808684229850769, "step": 522, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0002644062042236 }, { "episode": 8384, "epoch": 0.15069921271165115, "loss/policy_avg": 0.14779390394687653, "lr": 9.665771983640082e-06, "objective/entropy": -218.98468017578125, "objective/kl": 32.48244857788086, "objective/non_score_reward": -3.2482452392578125, "objective/rlhf_reward": -11.667467508345766, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 5.038860321044922, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7193934917449951, "step": 523, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9989609718322754 }, { "episode": 8400, "epoch": 0.1509868066290398, "loss/policy_avg": 0.4430120587348938, "lr": 9.665132924335379e-06, "objective/entropy": -117.5274887084961, "objective/kl": 34.405799865722656, "objective/non_score_reward": -3.440580129623413, "objective/rlhf_reward": -12.158200535837727, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 42.504093170166016, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5020079612731934, "step": 524, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9977314472198486 }, { "episode": 8416, "epoch": 0.15127440054642843, "loss/policy_avg": 0.2200871706008911, "lr": 9.664493865030676e-06, "objective/entropy": -109.22439575195312, "objective/kl": 27.03481674194336, "objective/non_score_reward": -2.703481674194336, "objective/rlhf_reward": -9.152067666471588, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 67.23435974121094, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7329505681991577, "step": 525, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9962201118469238 }, { "episode": 8432, "epoch": 0.1515619944638171, "loss/policy_avg": 0.7378959655761719, "lr": 9.663854805725971e-06, "objective/entropy": 51.754112243652344, "objective/kl": 52.813716888427734, "objective/non_score_reward": -5.281371593475342, "objective/rlhf_reward": -19.644534471447827, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 35.635292053222656, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.509136438369751, "step": 526, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.999948501586914 }, { "episode": 8448, "epoch": 0.15184958838120574, "loss/policy_avg": 0.12148091197013855, "lr": 9.663215746421268e-06, "objective/entropy": -194.1416015625, "objective/kl": 25.8511962890625, "objective/non_score_reward": -2.5851194858551025, "objective/rlhf_reward": -8.889879803271636, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 57.73289489746094, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5093759894371033, "step": 527, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9996519088745117 }, { "episode": 8464, "epoch": 0.15213718229859438, "loss/policy_avg": 0.5260336399078369, "lr": 9.662576687116565e-06, "objective/entropy": -28.452239990234375, "objective/kl": 47.280189514160156, "objective/non_score_reward": -4.728018760681152, "objective/rlhf_reward": -17.307955060068686, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 6.9857563972473145, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6503279209136963, "step": 528, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9982560873031616 }, { "episode": 8480, "epoch": 0.15242477621598302, "loss/policy_avg": 0.35683369636535645, "lr": 9.661937627811862e-06, "objective/entropy": 73.58621978759766, "objective/kl": 32.123538970947266, "objective/non_score_reward": -3.2123541831970215, "objective/rlhf_reward": -8.449416255950927, "objective/scores": 1.1, "policy/approxkl_avg": 18.77471923828125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4997715950012207, "step": 529, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.996805191040039 }, { "episode": 8496, "epoch": 0.1527123701333717, "loss/policy_avg": 0.5127010345458984, "lr": 9.661298568507158e-06, "objective/entropy": -205.9234161376953, "objective/kl": 33.17967987060547, "objective/non_score_reward": -3.3179678916931152, "objective/rlhf_reward": -11.149165334478887, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 9.201175689697266, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6817153692245483, "step": 530, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.999847173690796 }, { "episode": 8512, "epoch": 0.15299996405076033, "loss/policy_avg": 2.6809909343719482, "lr": 9.660659509202455e-06, "objective/entropy": -134.624267578125, "objective/kl": 42.431121826171875, "objective/non_score_reward": -4.243112564086914, "objective/rlhf_reward": -15.49149716180122, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 7.862943649291992, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7814310789108276, "step": 531, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0017952919006348 }, { "episode": 8528, "epoch": 0.15328755796814897, "loss/policy_avg": 1.801469326019287, "lr": 9.66002044989775e-06, "objective/entropy": -132.70196533203125, "objective/kl": 41.52480697631836, "objective/non_score_reward": -4.152480602264404, "objective/rlhf_reward": -15.186090548236933, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 20.436891555786133, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5815787315368652, "step": 532, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0002007484436035 }, { "episode": 8544, "epoch": 0.1535751518855376, "loss/policy_avg": -0.09221082925796509, "lr": 9.659381390593047e-06, "objective/entropy": -122.66864776611328, "objective/kl": 32.93227005004883, "objective/non_score_reward": -3.293226957321167, "objective/rlhf_reward": -11.439574495951334, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 4.255980014801025, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7247042655944824, "step": 533, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9997018575668335 }, { "episode": 8560, "epoch": 0.15386274580292628, "loss/policy_avg": 2.4887824058532715, "lr": 9.658742331288344e-06, "objective/entropy": -99.11781311035156, "objective/kl": 40.88715362548828, "objective/non_score_reward": -4.088715553283691, "objective/rlhf_reward": -16.354861974716187, "objective/scores": 0.0, "policy/approxkl_avg": 15.820581436157227, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.699001133441925, "step": 534, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.996523141860962 }, { "episode": 8576, "epoch": 0.15415033972031492, "loss/policy_avg": -0.13506712019443512, "lr": 9.658103271983641e-06, "objective/entropy": -89.46176147460938, "objective/kl": 39.85636901855469, "objective/non_score_reward": -3.9856371879577637, "objective/rlhf_reward": -14.280688291013824, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 1.0638363361358643, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5437813401222229, "step": 535, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0016019344329834 }, { "episode": 8592, "epoch": 0.15443793363770356, "loss/policy_avg": 0.1690143346786499, "lr": 9.657464212678938e-06, "objective/entropy": -69.37506866455078, "objective/kl": 33.3219108581543, "objective/non_score_reward": -3.332190990447998, "objective/rlhf_reward": -11.666904454649078, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 3.42812442779541, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7411162853240967, "step": 536, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0052967071533203 }, { "episode": 8608, "epoch": 0.1547255275550922, "loss/policy_avg": 0.6035354733467102, "lr": 9.656825153374235e-06, "objective/entropy": -19.672401428222656, "objective/kl": 43.020362854003906, "objective/non_score_reward": -4.302036285400391, "objective/rlhf_reward": -15.651886313167168, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 42.07685089111328, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7375407814979553, "step": 537, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9986612796783447 }, { "episode": 8624, "epoch": 0.15501312147248086, "loss/policy_avg": 0.617863655090332, "lr": 9.65618609406953e-06, "objective/entropy": -96.45539093017578, "objective/kl": 43.908935546875, "objective/non_score_reward": -4.390893936157227, "objective/rlhf_reward": -17.56357455253601, "objective/scores": 0.0, "policy/approxkl_avg": 41.87989807128906, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6939387917518616, "step": 538, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9990483522415161 }, { "episode": 8640, "epoch": 0.1553007153898695, "loss/policy_avg": 1.1206727027893066, "lr": 9.655547034764827e-06, "objective/entropy": -212.7060089111328, "objective/kl": 30.902385711669922, "objective/non_score_reward": -3.0902388095855713, "objective/rlhf_reward": -11.019319584875731, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 32.364540100097656, "policy/clipfrac_avg": 0.25, "policy/entropy_avg": 0.4254102110862732, "step": 539, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9980194568634033 }, { "episode": 8656, "epoch": 0.15558830930725814, "loss/policy_avg": 0.10584121942520142, "lr": 9.654907975460124e-06, "objective/entropy": -244.4935760498047, "objective/kl": 35.60231018066406, "objective/non_score_reward": -3.5602309703826904, "objective/rlhf_reward": -14.24092435836792, "objective/scores": 0.0, "policy/approxkl_avg": 7.080685615539551, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5694947838783264, "step": 540, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.001322031021118 }, { "episode": 8672, "epoch": 0.1558759032246468, "loss/policy_avg": 0.6853758096694946, "lr": 9.65426891615542e-06, "objective/entropy": 200.61570739746094, "objective/kl": 53.1297492980957, "objective/non_score_reward": -5.31297492980957, "objective/rlhf_reward": -18.85190019607544, "objective/scores": 0.6, "policy/approxkl_avg": 14.122016906738281, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.846699595451355, "step": 541, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9972319602966309 }, { "episode": 8688, "epoch": 0.15616349714203545, "loss/policy_avg": 0.8006021976470947, "lr": 9.653629856850718e-06, "objective/entropy": -177.16116333007812, "objective/kl": 48.53944396972656, "objective/non_score_reward": -4.853944301605225, "objective/rlhf_reward": -17.965178589434966, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 88.10623931884766, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5958341360092163, "step": 542, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.996194839477539 }, { "episode": 8704, "epoch": 0.1564510910594241, "loss/policy_avg": 0.7566841244697571, "lr": 9.652990797546013e-06, "objective/entropy": -168.28700256347656, "objective/kl": 38.52307891845703, "objective/non_score_reward": -3.8523077964782715, "objective/rlhf_reward": -14.067595055609374, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 21.15276336669922, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8955333232879639, "step": 543, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.0001535415649414 }, { "episode": 8720, "epoch": 0.15673868497681273, "loss/policy_avg": 2.5178232192993164, "lr": 9.65235173824131e-06, "objective/entropy": -204.9830322265625, "objective/kl": 42.33374786376953, "objective/non_score_reward": -4.23337459564209, "objective/rlhf_reward": -15.55489716776977, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 63.29674530029297, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7277956008911133, "step": 544, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9969232082366943 }, { "episode": 8736, "epoch": 0.1570262788942014, "loss/policy_avg": 0.6960093975067139, "lr": 9.651712678936605e-06, "objective/entropy": 95.6426773071289, "objective/kl": 32.65497970581055, "objective/non_score_reward": -3.265498161315918, "objective/rlhf_reward": -11.736479315787477, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 17.43046760559082, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4536857604980469, "step": 545, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9972350597381592 }, { "episode": 8752, "epoch": 0.15731387281159004, "loss/policy_avg": 0.9652435779571533, "lr": 9.651073619631902e-06, "objective/entropy": 20.44597625732422, "objective/kl": 33.02486038208008, "objective/non_score_reward": -3.302485942840576, "objective/rlhf_reward": -11.0872378966966, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 30.350900650024414, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7735705375671387, "step": 546, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9980368614196777 }, { "episode": 8768, "epoch": 0.15760146672897868, "loss/policy_avg": 0.7415152788162231, "lr": 9.650434560327199e-06, "objective/entropy": -252.13894653320312, "objective/kl": 33.90022659301758, "objective/non_score_reward": -3.3900227546691895, "objective/rlhf_reward": -11.898231034696686, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 6.943023681640625, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5897954702377319, "step": 547, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.003216028213501 }, { "episode": 8784, "epoch": 0.15788906064636732, "loss/policy_avg": 0.5711463093757629, "lr": 9.649795501022496e-06, "objective/entropy": -185.48312377929688, "objective/kl": 37.01996994018555, "objective/non_score_reward": -3.7019972801208496, "objective/rlhf_reward": -12.407988643646242, "objective/scores": 0.6, "policy/approxkl_avg": 4.724537372589111, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.563432514667511, "step": 548, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.000291585922241 }, { "episode": 8800, "epoch": 0.158176654563756, "loss/policy_avg": 1.1151976585388184, "lr": 9.649156441717792e-06, "objective/entropy": -300.8952331542969, "objective/kl": 50.2194938659668, "objective/non_score_reward": -5.021949768066406, "objective/rlhf_reward": -18.531539766994072, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 10.345390319824219, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5680312514305115, "step": 549, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.998915672302246 }, { "episode": 8816, "epoch": 0.15846424848114463, "loss/policy_avg": -0.044495925307273865, "lr": 9.64851738241309e-06, "objective/entropy": -122.7219009399414, "objective/kl": 33.53822708129883, "objective/non_score_reward": -3.353822708129883, "objective/rlhf_reward": -11.681957976023355, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 2.664604902267456, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.675072193145752, "step": 550, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.000653028488159 }, { "episode": 8832, "epoch": 0.15875184239853327, "loss/policy_avg": 0.5436700582504272, "lr": 9.647878323108384e-06, "objective/entropy": -333.17449951171875, "objective/kl": 26.372095108032227, "objective/non_score_reward": -2.637209892272949, "objective/rlhf_reward": -8.94471893078478, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 19.02151870727539, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7037907838821411, "step": 551, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9981114864349365 }, { "episode": 8848, "epoch": 0.1590394363159219, "loss/policy_avg": 3.402425765991211, "lr": 9.647239263803681e-06, "objective/entropy": -13.591423034667969, "objective/kl": 46.72789764404297, "objective/non_score_reward": -4.672789573669434, "objective/rlhf_reward": -17.240561823459014, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 26.33546257019043, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6139748096466064, "step": 552, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.998695731163025 }, { "episode": 8864, "epoch": 0.15932703023331057, "loss/policy_avg": 1.2331594228744507, "lr": 9.646600204498978e-06, "objective/entropy": 47.64061737060547, "objective/kl": 45.18553924560547, "objective/non_score_reward": -4.518553733825684, "objective/rlhf_reward": -15.951509179846319, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 38.655860900878906, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5798001289367676, "step": 553, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9974098205566406 }, { "episode": 8880, "epoch": 0.1596146241506992, "loss/policy_avg": 0.5167936086654663, "lr": 9.645961145194275e-06, "objective/entropy": -60.59259796142578, "objective/kl": 34.74770736694336, "objective/non_score_reward": -3.474771022796631, "objective/rlhf_reward": -12.074255581173013, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 62.432594299316406, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5199726819992065, "step": 554, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9998353719711304 }, { "episode": 8896, "epoch": 0.15990221806808785, "loss/policy_avg": 0.5368022322654724, "lr": 9.645322085889572e-06, "objective/entropy": -187.44839477539062, "objective/kl": 44.21339416503906, "objective/non_score_reward": -4.42133903503418, "objective/rlhf_reward": -16.023497348249542, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 24.31802749633789, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7579926252365112, "step": 555, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.997237205505371 }, { "episode": 8912, "epoch": 0.1601898119854765, "loss/policy_avg": 0.281308650970459, "lr": 9.644683026584867e-06, "objective/entropy": -127.6090087890625, "objective/kl": 37.9559440612793, "objective/non_score_reward": -3.7955944538116455, "objective/rlhf_reward": -13.666606032642061, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 24.68885040283203, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5633901357650757, "step": 556, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9994604587554932 }, { "episode": 8928, "epoch": 0.16047740590286516, "loss/policy_avg": 0.20386452972888947, "lr": 9.644043967280164e-06, "objective/entropy": -198.5346221923828, "objective/kl": 23.749149322509766, "objective/non_score_reward": -2.37491512298584, "objective/rlhf_reward": -8.140410625670834, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 4.18134069442749, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5204647183418274, "step": 557, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9984912872314453 }, { "episode": 8944, "epoch": 0.1607649998202538, "loss/policy_avg": 0.07800257205963135, "lr": 9.643404907975461e-06, "objective/entropy": -6.492671966552734, "objective/kl": 38.0936164855957, "objective/non_score_reward": -3.8093619346618652, "objective/rlhf_reward": -13.721675479205782, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 34.346961975097656, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6237818002700806, "step": 558, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9995702505111694 }, { "episode": 8960, "epoch": 0.16105259373764244, "loss/policy_avg": 0.3377554416656494, "lr": 9.642765848670758e-06, "objective/entropy": -133.30020141601562, "objective/kl": 43.600772857666016, "objective/non_score_reward": -4.360077857971191, "objective/rlhf_reward": -16.11479810240857, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 41.798095703125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6249016523361206, "step": 559, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9988707304000854 }, { "episode": 8976, "epoch": 0.1613401876550311, "loss/policy_avg": 0.9884511232376099, "lr": 9.642126789366055e-06, "objective/entropy": -234.08079528808594, "objective/kl": 37.608299255371094, "objective/non_score_reward": -3.7608296871185303, "objective/rlhf_reward": -10.643318510055543, "objective/scores": 1.1, "policy/approxkl_avg": 17.653762817382812, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7640604376792908, "step": 560, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9996306896209717 }, { "episode": 8992, "epoch": 0.16162778157241975, "loss/policy_avg": 0.6431108713150024, "lr": 9.641487730061352e-06, "objective/entropy": -74.99330139160156, "objective/kl": 41.57349395751953, "objective/non_score_reward": -4.157349109649658, "objective/rlhf_reward": -15.303883824378175, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 38.90899658203125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.47073087096214294, "step": 561, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9977537393569946 }, { "episode": 9008, "epoch": 0.1619153754898084, "loss/policy_avg": 0.022986948490142822, "lr": 9.640848670756647e-06, "objective/entropy": 68.02236938476562, "objective/kl": 48.71097946166992, "objective/non_score_reward": -4.871097564697266, "objective/rlhf_reward": -15.084390974044801, "objective/scores": 1.1, "policy/approxkl_avg": 33.34661102294922, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.692690372467041, "step": 562, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.001621723175049 }, { "episode": 9024, "epoch": 0.16220296940719703, "loss/policy_avg": 0.07986500859260559, "lr": 9.640209611451944e-06, "objective/entropy": -252.08282470703125, "objective/kl": 31.946533203125, "objective/non_score_reward": -3.1946535110473633, "objective/rlhf_reward": -10.831202576832709, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 7.246588706970215, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6767693758010864, "step": 563, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9986155033111572 }, { "episode": 9040, "epoch": 0.1624905633245857, "loss/policy_avg": 1.3177964687347412, "lr": 9.63957055214724e-06, "objective/entropy": -93.1424331665039, "objective/kl": 48.07649612426758, "objective/non_score_reward": -4.807649612426758, "objective/rlhf_reward": -17.283187220768866, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 11.541034698486328, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.561279296875, "step": 564, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9975336790084839 }, { "episode": 9056, "epoch": 0.16277815724197434, "loss/policy_avg": 0.01985163800418377, "lr": 9.638931492842537e-06, "objective/entropy": -291.87255859375, "objective/kl": 36.759742736816406, "objective/non_score_reward": -3.675974130630493, "objective/rlhf_reward": -13.325294115630488, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 2.6155858039855957, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6654636263847351, "step": 565, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.000267505645752 }, { "episode": 9072, "epoch": 0.16306575115936298, "loss/policy_avg": 0.2043217122554779, "lr": 9.638292433537834e-06, "objective/entropy": -73.8929443359375, "objective/kl": 51.33345413208008, "objective/non_score_reward": -5.133345603942871, "objective/rlhf_reward": -19.109550078113642, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 33.83651351928711, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7135969400405884, "step": 566, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.998769998550415 }, { "episode": 9088, "epoch": 0.16335334507675162, "loss/policy_avg": 0.5555611848831177, "lr": 9.63765337423313e-06, "objective/entropy": -161.65342712402344, "objective/kl": 41.112884521484375, "objective/non_score_reward": -4.111289024353027, "objective/rlhf_reward": -14.711821810404459, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 42.619407653808594, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5503981709480286, "step": 567, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0006415843963623 }, { "episode": 9104, "epoch": 0.16364093899414028, "loss/policy_avg": 0.822566032409668, "lr": 9.637014314928426e-06, "objective/entropy": -121.46438598632812, "objective/kl": 38.44181823730469, "objective/non_score_reward": -3.844181776046753, "objective/rlhf_reward": -13.551898355754922, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 86.43548583984375, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5650614500045776, "step": 568, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9973005056381226 }, { "episode": 9120, "epoch": 0.16392853291152892, "loss/policy_avg": 0.9177229404449463, "lr": 9.636375255623721e-06, "objective/entropy": -71.64779663085938, "objective/kl": 53.37040710449219, "objective/non_score_reward": -5.337040901184082, "objective/rlhf_reward": -21.348163604736328, "objective/scores": 0.0, "policy/approxkl_avg": 16.75054359436035, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7622511386871338, "step": 569, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9974229335784912 }, { "episode": 9136, "epoch": 0.16421612682891756, "loss/policy_avg": 2.550429582595825, "lr": 9.635736196319018e-06, "objective/entropy": -198.0861053466797, "objective/kl": 37.597801208496094, "objective/non_score_reward": -3.7597804069519043, "objective/rlhf_reward": -13.305787817637125, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 17.44908905029297, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7178118228912354, "step": 570, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9977492094039917 }, { "episode": 9152, "epoch": 0.1645037207463062, "loss/policy_avg": 1.1576682329177856, "lr": 9.635097137014315e-06, "objective/entropy": -94.52823638916016, "objective/kl": 41.15838623046875, "objective/non_score_reward": -4.115838050842285, "objective/rlhf_reward": -12.063353157043458, "objective/scores": 1.1, "policy/approxkl_avg": 34.44091033935547, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6733859181404114, "step": 571, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9955500364303589 }, { "episode": 9168, "epoch": 0.16479131466369487, "loss/policy_avg": 0.7017364501953125, "lr": 9.634458077709612e-06, "objective/entropy": -58.18111801147461, "objective/kl": 47.94506072998047, "objective/non_score_reward": -4.794506549835205, "objective/rlhf_reward": -17.818776333068293, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 5.405066013336182, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6459875106811523, "step": 572, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.999786615371704 }, { "episode": 9184, "epoch": 0.1650789085810835, "loss/policy_avg": 0.6844867467880249, "lr": 9.633819018404909e-06, "objective/entropy": -182.14816284179688, "objective/kl": 37.610591888427734, "objective/non_score_reward": -3.761059284210205, "objective/rlhf_reward": -13.593638758273467, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 45.46365737915039, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5423248410224915, "step": 573, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0002753734588623 }, { "episode": 9200, "epoch": 0.16536650249847215, "loss/policy_avg": 0.8669987916946411, "lr": 9.633179959100206e-06, "objective/entropy": -232.71591186523438, "objective/kl": 40.21477127075195, "objective/non_score_reward": -4.021476745605469, "objective/rlhf_reward": -14.424048905790436, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 35.74150085449219, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5931271314620972, "step": 574, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9967412948608398 }, { "episode": 9216, "epoch": 0.1656540964158608, "loss/policy_avg": 1.4781056642532349, "lr": 9.632540899795501e-06, "objective/entropy": -273.8671569824219, "objective/kl": 34.78312301635742, "objective/non_score_reward": -3.4783124923706055, "objective/rlhf_reward": -12.489417989452448, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 10.15843391418457, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6542942523956299, "step": 575, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.997775673866272 }, { "episode": 9232, "epoch": 0.16594169033324946, "loss/policy_avg": 0.36914390325546265, "lr": 9.631901840490798e-06, "objective/entropy": -327.1934814453125, "objective/kl": 27.51502227783203, "objective/non_score_reward": -2.75150203704834, "objective/rlhf_reward": -9.606008923053743, "objective/scores": 0.35, "policy/approxkl_avg": 28.219341278076172, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6692686080932617, "step": 576, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0001039505004883 }, { "episode": 9248, "epoch": 0.1662292842506381, "loss/policy_avg": 0.13604627549648285, "lr": 9.631262781186095e-06, "objective/entropy": -145.65203857421875, "objective/kl": 41.72386169433594, "objective/non_score_reward": -4.172386169433594, "objective/rlhf_reward": -15.347908547430663, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 14.616079330444336, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7129019498825073, "step": 577, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.000507354736328 }, { "episode": 9264, "epoch": 0.16651687816802674, "loss/policy_avg": 0.17738819122314453, "lr": 9.630623721881392e-06, "objective/entropy": -60.45104217529297, "objective/kl": 34.91204833984375, "objective/non_score_reward": -3.4912052154541016, "objective/rlhf_reward": -12.40856227180059, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 5.907928943634033, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5182020664215088, "step": 578, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0003273487091064 }, { "episode": 9280, "epoch": 0.1668044720854154, "loss/policy_avg": 1.5535671710968018, "lr": 9.629984662576689e-06, "objective/entropy": -110.2305679321289, "objective/kl": 39.423648834228516, "objective/non_score_reward": -3.9423651695251465, "objective/rlhf_reward": -14.41021057340948, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 16.688522338867188, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.553377628326416, "step": 579, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.000542402267456 }, { "episode": 9296, "epoch": 0.16709206600280405, "loss/policy_avg": -0.21501094102859497, "lr": 9.629345603271984e-06, "objective/entropy": -208.572509765625, "objective/kl": 29.07717514038086, "objective/non_score_reward": -2.90771746635437, "objective/rlhf_reward": -10.207038243015376, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 16.939083099365234, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6292991638183594, "step": 580, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0005156993865967 }, { "episode": 9312, "epoch": 0.16737965992019269, "loss/policy_avg": -0.1628519594669342, "lr": 9.62870654396728e-06, "objective/entropy": -64.79794311523438, "objective/kl": 37.24529266357422, "objective/non_score_reward": -3.724529266357422, "objective/rlhf_reward": -14.898117303848267, "objective/scores": 0.0, "policy/approxkl_avg": 81.15853881835938, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.37679994106292725, "step": 581, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0008161067962646 }, { "episode": 9328, "epoch": 0.16766725383758133, "loss/policy_avg": 0.1621185839176178, "lr": 9.628067484662578e-06, "objective/entropy": -256.913330078125, "objective/kl": 36.676727294921875, "objective/non_score_reward": -3.667672872543335, "objective/rlhf_reward": -13.008831744611847, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 34.5910530090332, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5756609439849854, "step": 582, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9992961883544922 }, { "episode": 9344, "epoch": 0.16795484775497, "loss/policy_avg": 0.29023104906082153, "lr": 9.627428425357874e-06, "objective/entropy": -18.905437469482422, "objective/kl": 45.148834228515625, "objective/non_score_reward": -4.514883041381836, "objective/rlhf_reward": -16.57858121674812, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 30.742298126220703, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8326776027679443, "step": 583, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9982852935791016 }, { "episode": 9360, "epoch": 0.16824244167235863, "loss/policy_avg": 1.3382362127304077, "lr": 9.626789366053171e-06, "objective/entropy": -269.2889404296875, "objective/kl": 36.53486633300781, "objective/non_score_reward": -3.653486490249634, "objective/rlhf_reward": -10.213946199417116, "objective/scores": 1.1, "policy/approxkl_avg": 35.406089782714844, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.8077678680419922, "step": 584, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 1.99800443649292 }, { "episode": 9376, "epoch": 0.16853003558974727, "loss/policy_avg": 0.42918533086776733, "lr": 9.626150306748468e-06, "objective/entropy": 149.2460174560547, "objective/kl": 39.844512939453125, "objective/non_score_reward": -3.9844510555267334, "objective/rlhf_reward": -14.578554355834406, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 59.42961883544922, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.3823997676372528, "step": 585, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9983184337615967 }, { "episode": 9392, "epoch": 0.1688176295071359, "loss/policy_avg": 0.09273044764995575, "lr": 9.625511247443763e-06, "objective/entropy": -167.0006103515625, "objective/kl": 32.98309326171875, "objective/non_score_reward": -3.298309087753296, "objective/rlhf_reward": -11.833986723159237, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 101.41903686523438, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.547985315322876, "step": 586, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9995431900024414 }, { "episode": 9408, "epoch": 0.16910522342452458, "loss/policy_avg": 0.1386108696460724, "lr": 9.62487218813906e-06, "objective/entropy": -189.29864501953125, "objective/kl": 43.7606201171875, "objective/non_score_reward": -4.376062393188477, "objective/rlhf_reward": -16.02329647820747, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 3.214796543121338, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.551296591758728, "step": 587, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9979034662246704 }, { "episode": 9424, "epoch": 0.16939281734191322, "loss/policy_avg": 0.15018180012702942, "lr": 9.624233128834357e-06, "objective/entropy": -151.0711669921875, "objective/kl": 36.06580352783203, "objective/non_score_reward": -3.6065807342529297, "objective/rlhf_reward": -13.084686806708007, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 24.849681854248047, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.5903939008712769, "step": 588, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9969112873077393 }, { "episode": 9440, "epoch": 0.16968041125930186, "loss/policy_avg": 0.6633468866348267, "lr": 9.623594069529654e-06, "objective/entropy": -88.4188232421875, "objective/kl": 41.29294204711914, "objective/non_score_reward": -4.129294395446777, "objective/rlhf_reward": -15.175541928320555, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 53.11321258544922, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7691140174865723, "step": 589, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9956674575805664 }, { "episode": 9456, "epoch": 0.1699680051766905, "loss/policy_avg": -0.025976277887821198, "lr": 9.62295501022495e-06, "objective/entropy": -23.04672622680664, "objective/kl": 42.318519592285156, "objective/non_score_reward": -4.231852054595947, "objective/rlhf_reward": -15.371149151530815, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 17.313936233520508, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.4834129214286804, "step": 590, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9979733228683472 }, { "episode": 9472, "epoch": 0.17025559909407917, "loss/policy_avg": 0.35992100834846497, "lr": 9.622315950920246e-06, "objective/entropy": -135.56903076171875, "objective/kl": 37.865943908691406, "objective/non_score_reward": -3.7865943908691406, "objective/rlhf_reward": -13.787127458785456, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 34.8250732421875, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6128636598587036, "step": 591, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9998369216918945 }, { "episode": 9488, "epoch": 0.1705431930114678, "loss/policy_avg": -0.050314195454120636, "lr": 9.621676891615543e-06, "objective/entropy": -198.44839477539062, "objective/kl": 38.32909393310547, "objective/non_score_reward": -3.832909345626831, "objective/rlhf_reward": -13.972387277816217, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 2.4614109992980957, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7091498374938965, "step": 592, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9986573457717896 }, { "episode": 9504, "epoch": 0.17083078692885645, "loss/policy_avg": -0.00115779263433069, "lr": 9.621037832310838e-06, "objective/entropy": -143.01028442382812, "objective/kl": 32.83100509643555, "objective/non_score_reward": -3.2831003665924072, "objective/rlhf_reward": -11.184990237431464, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 8.594361305236816, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6112456321716309, "step": 593, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0005526542663574 }, { "episode": 9520, "epoch": 0.1711183808462451, "loss/policy_avg": 0.5914216041564941, "lr": 9.620398773006135e-06, "objective/entropy": -15.847023010253906, "objective/kl": 41.543052673339844, "objective/non_score_reward": -4.154305458068848, "objective/rlhf_reward": -15.193389613826838, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 26.619815826416016, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5229488611221313, "step": 594, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.999138355255127 }, { "episode": 9536, "epoch": 0.17140597476363376, "loss/policy_avg": 0.48162180185317993, "lr": 9.619759713701432e-06, "objective/entropy": 49.441688537597656, "objective/kl": 41.245628356933594, "objective/non_score_reward": -4.124563217163086, "objective/rlhf_reward": -15.11965022334228, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 119.17011260986328, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4364345073699951, "step": 595, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9999006986618042 }, { "episode": 9552, "epoch": 0.1716935686810224, "loss/policy_avg": 0.13220281898975372, "lr": 9.619120654396729e-06, "objective/entropy": -217.9917755126953, "objective/kl": 41.454307556152344, "objective/non_score_reward": -4.145431041717529, "objective/rlhf_reward": -15.100772025997998, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 26.718765258789062, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6766424775123596, "step": 596, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9994851350784302 }, { "episode": 9568, "epoch": 0.17198116259841104, "loss/policy_avg": 0.39218467473983765, "lr": 9.618481595092026e-06, "objective/entropy": -321.5009460449219, "objective/kl": 36.74188232421875, "objective/non_score_reward": -3.6741881370544434, "objective/rlhf_reward": -13.034893279493438, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 27.68082046508789, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5078242421150208, "step": 597, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.999809980392456 }, { "episode": 9584, "epoch": 0.1722687565157997, "loss/policy_avg": 0.1808081567287445, "lr": 9.617842535787323e-06, "objective/entropy": -172.8281707763672, "objective/kl": 40.136253356933594, "objective/non_score_reward": -4.013625144958496, "objective/rlhf_reward": -14.603903393359527, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 13.015371322631836, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5869624614715576, "step": 598, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.999362587928772 }, { "episode": 9600, "epoch": 0.17255635043318834, "loss/policy_avg": -0.12051941454410553, "lr": 9.617203476482618e-06, "objective/entropy": -113.57295989990234, "objective/kl": 37.42548370361328, "objective/non_score_reward": -3.7425484657287598, "objective/rlhf_reward": -13.022782753186164, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 117.42220306396484, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7584841847419739, "step": 599, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9973876476287842 }, { "episode": 9616, "epoch": 0.17284394435057698, "loss/policy_avg": 1.5483704805374146, "lr": 9.616564417177915e-06, "objective/entropy": -355.0560302734375, "objective/kl": 39.736637115478516, "objective/non_score_reward": -3.973663568496704, "objective/rlhf_reward": -14.378882491382296, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 12.028496742248535, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7178500294685364, "step": 600, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.000492572784424 }, { "episode": 9632, "epoch": 0.17313153826796562, "loss/policy_avg": 0.6728031039237976, "lr": 9.615925357873211e-06, "objective/entropy": -193.89974975585938, "objective/kl": 32.3430290222168, "objective/non_score_reward": -3.2343029975891113, "objective/rlhf_reward": -11.275351767957794, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 27.227508544921875, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5896027088165283, "step": 601, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.998093605041504 }, { "episode": 9648, "epoch": 0.1734191321853543, "loss/policy_avg": 2.2606358528137207, "lr": 9.615286298568508e-06, "objective/entropy": -240.98487854003906, "objective/kl": 43.463775634765625, "objective/non_score_reward": -4.346377372741699, "objective/rlhf_reward": -15.723651414335357, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 39.062442779541016, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5951015949249268, "step": 602, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.998093843460083 }, { "episode": 9664, "epoch": 0.17370672610274293, "loss/policy_avg": 1.1621661186218262, "lr": 9.614647239263805e-06, "objective/entropy": -128.38201904296875, "objective/kl": 49.477264404296875, "objective/non_score_reward": -4.947726249694824, "objective/rlhf_reward": -18.44927029898706, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 32.63263702392578, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6886211633682251, "step": 603, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9999189376831055 }, { "episode": 9680, "epoch": 0.17399432002013157, "loss/policy_avg": 0.4888812303543091, "lr": 9.6140081799591e-06, "objective/entropy": -90.85795593261719, "objective/kl": 46.02019500732422, "objective/non_score_reward": -4.602019786834717, "objective/rlhf_reward": -16.85182079574163, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 18.202045440673828, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6015419960021973, "step": 604, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.99855637550354 }, { "episode": 9696, "epoch": 0.1742819139375202, "loss/policy_avg": 0.5101502537727356, "lr": 9.613369120654397e-06, "objective/entropy": -405.04302978515625, "objective/kl": 39.584861755371094, "objective/non_score_reward": -3.958486557006836, "objective/rlhf_reward": -12.910226498485777, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 3.033137083053589, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6735005378723145, "step": 605, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.001615524291992 }, { "episode": 9712, "epoch": 0.17456950785490888, "loss/policy_avg": 1.0287859439849854, "lr": 9.612730061349694e-06, "objective/entropy": -313.8016662597656, "objective/kl": 44.508392333984375, "objective/non_score_reward": -4.450839042663574, "objective/rlhf_reward": -14.879638825298521, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 46.738216400146484, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.47219163179397583, "step": 606, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9983954429626465 }, { "episode": 9728, "epoch": 0.17485710177229752, "loss/policy_avg": 0.0703946053981781, "lr": 9.612091002044991e-06, "objective/entropy": -399.7001037597656, "objective/kl": 28.662792205810547, "objective/non_score_reward": -2.866279363632202, "objective/rlhf_reward": -10.105867826674862, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 54.66145706176758, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5760586261749268, "step": 607, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9973869323730469 }, { "episode": 9744, "epoch": 0.17514469568968616, "loss/policy_avg": -0.015791811048984528, "lr": 9.611451942740288e-06, "objective/entropy": -378.9763488769531, "objective/kl": 36.73283386230469, "objective/non_score_reward": -3.673283576965332, "objective/rlhf_reward": -13.089013609949667, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 12.10586929321289, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6941601037979126, "step": 608, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 16, "val/ratio": 1.9997576475143433 }, { "episode": 9760, "epoch": 0.1754322896070748, "loss/policy_avg": 0.15788066387176514, "lr": 9.610812883435585e-06, "objective/entropy": -255.90493774414062, "objective/kl": 48.99408721923828, "objective/non_score_reward": -4.899409294128418, "objective/rlhf_reward": -17.86430288950602, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 5.658967971801758, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6300114393234253, "step": 609, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0000362396240234 }, { "episode": 9776, "epoch": 0.17571988352446347, "loss/policy_avg": 1.4745373725891113, "lr": 9.61017382413088e-06, "objective/entropy": -92.97071838378906, "objective/kl": 54.66108322143555, "objective/non_score_reward": -5.466108322143555, "objective/rlhf_reward": -19.917022059636054, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 8.048752784729004, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5211347341537476, "step": 610, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9973746538162231 }, { "episode": 9792, "epoch": 0.1760074774418521, "loss/policy_avg": -0.23048746585845947, "lr": 9.609534764826177e-06, "objective/entropy": -110.38803100585938, "objective/kl": 28.35976219177246, "objective/non_score_reward": -2.8359761238098145, "objective/rlhf_reward": -11.343904733657837, "objective/scores": 0.0, "policy/approxkl_avg": 10.715461730957031, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.7749781608581543, "step": 611, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.001636505126953 }, { "episode": 9808, "epoch": 0.17629507135924075, "loss/policy_avg": 2.0556282997131348, "lr": 9.608895705521472e-06, "objective/entropy": -86.34033203125, "objective/kl": 38.10757064819336, "objective/non_score_reward": -3.8107571601867676, "objective/rlhf_reward": -13.638909462753851, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 79.96404266357422, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.47780632972717285, "step": 612, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9981083869934082 }, { "episode": 9824, "epoch": 0.17658266527662939, "loss/policy_avg": 2.2824478149414062, "lr": 9.608256646216769e-06, "objective/entropy": -346.4453125, "objective/kl": 51.050071716308594, "objective/non_score_reward": -5.105007171630859, "objective/rlhf_reward": -16.020029401779176, "objective/scores": 1.1, "policy/approxkl_avg": 11.854001998901367, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6633209586143494, "step": 613, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.998619556427002 }, { "episode": 9840, "epoch": 0.17687025919401805, "loss/policy_avg": 0.2608579397201538, "lr": 9.607617586912066e-06, "objective/entropy": -75.5977783203125, "objective/kl": 30.53676986694336, "objective/non_score_reward": -3.0536770820617676, "objective/rlhf_reward": -10.552848582685577, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 82.93180847167969, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7450053691864014, "step": 614, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9989418983459473 }, { "episode": 9856, "epoch": 0.1771578531114067, "loss/policy_avg": 0.5107153654098511, "lr": 9.606978527607363e-06, "objective/entropy": -230.27703857421875, "objective/kl": 40.28911590576172, "objective/non_score_reward": -4.028911590576172, "objective/rlhf_reward": -14.665047745318756, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 18.19654083251953, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7694085836410522, "step": 615, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9983909130096436 }, { "episode": 9872, "epoch": 0.17744544702879533, "loss/policy_avg": 0.14565017819404602, "lr": 9.60633946830266e-06, "objective/entropy": -214.1361083984375, "objective/kl": 16.23416519165039, "objective/non_score_reward": -1.6234164237976074, "objective/rlhf_reward": -3.5699468001138897, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 9.1363525390625, "policy/clipfrac_avg": 0.25, "policy/entropy_avg": 0.7368258237838745, "step": 616, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9986224174499512 }, { "episode": 9888, "epoch": 0.177733040946184, "loss/policy_avg": 0.916619598865509, "lr": 9.605700408997955e-06, "objective/entropy": 65.34817504882812, "objective/kl": 39.758148193359375, "objective/non_score_reward": -3.9758148193359375, "objective/rlhf_reward": -14.561623623877196, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 49.81817626953125, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6418388485908508, "step": 617, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9961199760437012 }, { "episode": 9904, "epoch": 0.17802063486357264, "loss/policy_avg": 0.43867984414100647, "lr": 9.605061349693252e-06, "objective/entropy": -291.1097106933594, "objective/kl": 30.660541534423828, "objective/non_score_reward": -3.066054105758667, "objective/rlhf_reward": -7.8642161846160885, "objective/scores": 1.1, "policy/approxkl_avg": 38.47760009765625, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.8005675077438354, "step": 618, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0004940032958984 }, { "episode": 9920, "epoch": 0.17830822878096128, "loss/policy_avg": 0.48867934942245483, "lr": 9.604422290388548e-06, "objective/entropy": -219.26034545898438, "objective/kl": 40.27912902832031, "objective/non_score_reward": -4.0279130935668945, "objective/rlhf_reward": -14.378319756189981, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 17.053512573242188, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5421030521392822, "step": 619, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9990657567977905 }, { "episode": 9936, "epoch": 0.17859582269834992, "loss/policy_avg": -0.16388291120529175, "lr": 9.603783231083845e-06, "objective/entropy": -5.59771728515625, "objective/kl": 49.777130126953125, "objective/non_score_reward": -4.977713584899902, "objective/rlhf_reward": -18.551603042815607, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 13.792423248291016, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7284325957298279, "step": 620, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0001120567321777 }, { "episode": 9952, "epoch": 0.1788834166157386, "loss/policy_avg": 0.22561949491500854, "lr": 9.603144171779142e-06, "objective/entropy": -119.03425598144531, "objective/kl": 39.67613220214844, "objective/non_score_reward": -3.967613697052002, "objective/rlhf_reward": -14.528818419485717, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 15.579143524169922, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.46927276253700256, "step": 621, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9986939430236816 }, { "episode": 9968, "epoch": 0.17917101053312723, "loss/policy_avg": -0.07936866581439972, "lr": 9.602505112474439e-06, "objective/entropy": -312.1820983886719, "objective/kl": 38.546356201171875, "objective/non_score_reward": -3.85463547706604, "objective/rlhf_reward": -13.814421925608237, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 5.652659893035889, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6830805540084839, "step": 622, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9996051788330078 }, { "episode": 9984, "epoch": 0.17945860445051587, "loss/policy_avg": 0.13592973351478577, "lr": 9.601866053169734e-06, "objective/entropy": -120.9598388671875, "objective/kl": 29.522968292236328, "objective/non_score_reward": -2.952296733856201, "objective/rlhf_reward": -10.409186935424804, "objective/scores": 0.35, "policy/approxkl_avg": 67.76509094238281, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5022489428520203, "step": 623, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.998583197593689 }, { "episode": 10000, "epoch": 0.1797461983679045, "loss/policy_avg": 0.12349797785282135, "lr": 9.601226993865031e-06, "objective/entropy": -97.76485443115234, "objective/kl": 31.539573669433594, "objective/non_score_reward": -3.1539573669433594, "objective/rlhf_reward": -11.215829467773437, "objective/scores": 0.35, "policy/approxkl_avg": 20.97466278076172, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7436941862106323, "step": 624, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9988412857055664 }, { "episode": 10016, "epoch": 0.18003379228529318, "loss/policy_avg": 0.24854499101638794, "lr": 9.600587934560328e-06, "objective/entropy": -142.84173583984375, "objective/kl": 37.62577819824219, "objective/non_score_reward": -3.762577533721924, "objective/rlhf_reward": -13.569357517178418, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 1.546608567237854, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.48624539375305176, "step": 625, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.000833749771118 }, { "episode": 10032, "epoch": 0.18032138620268182, "loss/policy_avg": 1.1254993677139282, "lr": 9.599948875255625e-06, "objective/entropy": -217.01876831054688, "objective/kl": 50.34384536743164, "objective/non_score_reward": -5.034384727478027, "objective/rlhf_reward": -18.758937218276362, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 56.71213150024414, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.6278942823410034, "step": 626, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 1.9996230602264404 }, { "episode": 10048, "epoch": 0.18060898012007046, "loss/policy_avg": 0.9575183391571045, "lr": 9.599309815950922e-06, "objective/entropy": -334.89495849609375, "objective/kl": 38.62886047363281, "objective/non_score_reward": -3.8628854751586914, "objective/rlhf_reward": -13.935770833285982, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 46.06591033935547, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6787221431732178, "step": 627, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9963583946228027 }, { "episode": 10064, "epoch": 0.1808965740374591, "loss/policy_avg": 0.37166520953178406, "lr": 9.598670756646217e-06, "objective/entropy": -253.23953247070312, "objective/kl": 29.572723388671875, "objective/non_score_reward": -2.9572722911834717, "objective/rlhf_reward": -10.313317143710789, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 29.606956481933594, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5231032371520996, "step": 628, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9965605735778809 }, { "episode": 10080, "epoch": 0.18118416795484776, "loss/policy_avg": 1.5333852767944336, "lr": 9.598031697341514e-06, "objective/entropy": -217.50408935546875, "objective/kl": 35.96586227416992, "objective/non_score_reward": -3.596586227416992, "objective/rlhf_reward": -9.986344790458679, "objective/scores": 1.1, "policy/approxkl_avg": 48.386070251464844, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7713555097579956, "step": 629, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9963762760162354 }, { "episode": 10096, "epoch": 0.1814717618722364, "loss/policy_avg": 0.10991726070642471, "lr": 9.59739263803681e-06, "objective/entropy": -114.67947387695312, "objective/kl": 38.67190933227539, "objective/non_score_reward": -3.8671910762786865, "objective/rlhf_reward": -13.987811687405467, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 64.13754272460938, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6923480033874512, "step": 630, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9989780187606812 }, { "episode": 10112, "epoch": 0.18175935578962504, "loss/policy_avg": 0.595374584197998, "lr": 9.596753578732108e-06, "objective/entropy": -72.69680786132812, "objective/kl": 44.85560607910156, "objective/non_score_reward": -4.485560894012451, "objective/rlhf_reward": -16.49164567431961, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 164.61390686035156, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.42736077308654785, "step": 631, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9984698295593262 }, { "episode": 10128, "epoch": 0.18204694970701368, "loss/policy_avg": -0.3946479856967926, "lr": 9.596114519427405e-06, "objective/entropy": 51.04241943359375, "objective/kl": 40.956275939941406, "objective/non_score_reward": -4.095627784729004, "objective/rlhf_reward": -14.9825101852417, "objective/scores": 0.35, "policy/approxkl_avg": 6.049509525299072, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6018968224525452, "step": 632, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0019989013671875 }, { "episode": 10144, "epoch": 0.18233454362440235, "loss/policy_avg": 0.049627840518951416, "lr": 9.595475460122701e-06, "objective/entropy": -355.97625732421875, "objective/kl": 14.994010925292969, "objective/non_score_reward": -1.4994010925292969, "objective/rlhf_reward": -4.4413450648456365, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 37.16582489013672, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.8317785263061523, "step": 633, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9989538192749023 }, { "episode": 10160, "epoch": 0.182622137541791, "loss/policy_avg": 0.5741980075836182, "lr": 9.594836400817997e-06, "objective/entropy": -140.58338928222656, "objective/kl": 38.065673828125, "objective/non_score_reward": -3.806567668914795, "objective/rlhf_reward": -13.62214973932894, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 11.47018814086914, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5414795875549316, "step": 634, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9968845844268799 }, { "episode": 10176, "epoch": 0.18290973145917963, "loss/policy_avg": 0.23386423289775848, "lr": 9.594197341513293e-06, "objective/entropy": -104.3201675415039, "objective/kl": 38.72356414794922, "objective/non_score_reward": -3.872356414794922, "objective/rlhf_reward": -15.489425420761108, "objective/scores": 0.0, "policy/approxkl_avg": 44.49272155761719, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5448688864707947, "step": 635, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9986062049865723 }, { "episode": 10192, "epoch": 0.1831973253765683, "loss/policy_avg": -0.018415771424770355, "lr": 9.593558282208589e-06, "objective/entropy": -323.28582763671875, "objective/kl": 35.251190185546875, "objective/non_score_reward": -3.525118827819824, "objective/rlhf_reward": -12.36714245478312, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 9.481925010681152, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6414960622787476, "step": 636, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9988579750061035 }, { "episode": 10208, "epoch": 0.18348491929395694, "loss/policy_avg": 0.43244630098342896, "lr": 9.592919222903886e-06, "objective/entropy": 79.17412567138672, "objective/kl": 46.785804748535156, "objective/non_score_reward": -4.678580284118652, "objective/rlhf_reward": -14.314322090148927, "objective/scores": 1.1, "policy/approxkl_avg": 13.969576835632324, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5658503770828247, "step": 637, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0009024143218994 }, { "episode": 10224, "epoch": 0.18377251321134558, "loss/policy_avg": 0.343896746635437, "lr": 9.592280163599182e-06, "objective/entropy": -136.99612426757812, "objective/kl": 37.76481628417969, "objective/non_score_reward": -3.7764816284179688, "objective/rlhf_reward": -13.705926513671876, "objective/scores": 0.35, "policy/approxkl_avg": 22.39147186279297, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6087017059326172, "step": 638, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0004689693450928 }, { "episode": 10240, "epoch": 0.18406010712873422, "loss/policy_avg": -0.11058389395475388, "lr": 9.59164110429448e-06, "objective/entropy": -101.61506652832031, "objective/kl": 42.183929443359375, "objective/non_score_reward": -4.218392372131348, "objective/rlhf_reward": -14.9261596900987, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 14.685033798217773, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6736252903938293, "step": 639, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9999792575836182 }, { "episode": 10256, "epoch": 0.18434770104612289, "loss/policy_avg": 0.0898745208978653, "lr": 9.591002044989776e-06, "objective/entropy": -113.99971008300781, "objective/kl": 40.50453186035156, "objective/non_score_reward": -4.050453186035156, "objective/rlhf_reward": -14.876299891501588, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 36.957977294921875, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5056084990501404, "step": 640, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9983000755310059 }, { "episode": 10272, "epoch": 0.18463529496351153, "loss/policy_avg": 0.5849778652191162, "lr": 9.590362985685071e-06, "objective/entropy": -274.3189392089844, "objective/kl": 40.9473876953125, "objective/non_score_reward": -4.094738960266113, "objective/rlhf_reward": -14.2562493703523, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 1.2600057125091553, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7059886455535889, "step": 641, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9991240501403809 }, { "episode": 10288, "epoch": 0.18492288888090017, "loss/policy_avg": 0.23213040828704834, "lr": 9.589723926380368e-06, "objective/entropy": -247.97145080566406, "objective/kl": 26.106571197509766, "objective/non_score_reward": -2.610656976699829, "objective/rlhf_reward": -9.018796045978632, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 64.95057678222656, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6558966636657715, "step": 642, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.001206398010254 }, { "episode": 10304, "epoch": 0.1852104827982888, "loss/policy_avg": 11.370098114013672, "lr": 9.589084867075665e-06, "objective/entropy": -46.817996978759766, "objective/kl": 38.347511291503906, "objective/non_score_reward": -3.8347513675689697, "objective/rlhf_reward": -15.339005589485168, "objective/scores": 0.0, "policy/approxkl_avg": 5.018136978149414, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.7419272661209106, "step": 643, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.000868558883667 }, { "episode": 10320, "epoch": 0.18549807671567747, "loss/policy_avg": 0.19895608723163605, "lr": 9.588445807770962e-06, "objective/entropy": -103.27410125732422, "objective/kl": 37.708839416503906, "objective/non_score_reward": -3.7708842754364014, "objective/rlhf_reward": -13.659704764087763, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 2.654179334640503, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.38327789306640625, "step": 644, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9996483325958252 }, { "episode": 10336, "epoch": 0.1857856706330661, "loss/policy_avg": 0.46165961027145386, "lr": 9.587806748466259e-06, "objective/entropy": -277.8352355957031, "objective/kl": 36.63224792480469, "objective/non_score_reward": -3.663224697113037, "objective/rlhf_reward": -13.096639483180596, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 18.761638641357422, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5858588218688965, "step": 645, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9983824491500854 }, { "episode": 10352, "epoch": 0.18607326455045475, "loss/policy_avg": 0.7416555881500244, "lr": 9.587167689161556e-06, "objective/entropy": -125.7095718383789, "objective/kl": 40.707427978515625, "objective/non_score_reward": -4.070743083953857, "objective/rlhf_reward": -14.904369452086787, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 79.23682403564453, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5754390358924866, "step": 646, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9980629682540894 }, { "episode": 10368, "epoch": 0.1863608584678434, "loss/policy_avg": 0.17042623460292816, "lr": 9.586528629856851e-06, "objective/entropy": -272.11273193359375, "objective/kl": 37.0462646484375, "objective/non_score_reward": -3.7046265602111816, "objective/rlhf_reward": -13.394673903186884, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 27.754566192626953, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7012656331062317, "step": 647, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9996850490570068 }, { "episode": 10384, "epoch": 0.18664845238523206, "loss/policy_avg": 0.7964584827423096, "lr": 9.585889570552148e-06, "objective/entropy": -128.65829467773438, "objective/kl": 36.44789123535156, "objective/non_score_reward": -3.6447887420654297, "objective/rlhf_reward": -13.25364259245984, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 22.820283889770508, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6445922255516052, "step": 648, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0019288063049316 }, { "episode": 10400, "epoch": 0.1869360463026207, "loss/policy_avg": 0.39208611845970154, "lr": 9.585250511247445e-06, "objective/entropy": -168.140625, "objective/kl": 40.124786376953125, "objective/non_score_reward": -4.012479305267334, "objective/rlhf_reward": -14.690666639541071, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 8.317047119140625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6378756761550903, "step": 649, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9976195096969604 }, { "episode": 10416, "epoch": 0.18722364022000934, "loss/policy_avg": 0.10013342648744583, "lr": 9.584611451942742e-06, "objective/entropy": -109.35111236572266, "objective/kl": 34.334266662597656, "objective/non_score_reward": -3.4334263801574707, "objective/rlhf_reward": -12.392069867163329, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 8.64936351776123, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5027997493743896, "step": 650, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9990345239639282 }, { "episode": 10432, "epoch": 0.18751123413739798, "loss/policy_avg": -0.09684228897094727, "lr": 9.583972392638038e-06, "objective/entropy": -120.9965591430664, "objective/kl": 42.267295837402344, "objective/non_score_reward": -4.2267303466796875, "objective/rlhf_reward": -15.082091207775186, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 3.862962245941162, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5842040777206421, "step": 651, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9994006156921387 }, { "episode": 10448, "epoch": 0.18779882805478665, "loss/policy_avg": 0.7214713096618652, "lr": 9.583333333333335e-06, "objective/entropy": 134.17503356933594, "objective/kl": 42.244041442871094, "objective/non_score_reward": -4.224404335021973, "objective/rlhf_reward": -15.072786922725747, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 37.499027252197266, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.571992039680481, "step": 652, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9959886074066162 }, { "episode": 10464, "epoch": 0.1880864219721753, "loss/policy_avg": 1.098000407218933, "lr": 9.58269427402863e-06, "objective/entropy": 41.24781799316406, "objective/kl": 44.69629669189453, "objective/non_score_reward": -4.469630241394043, "objective/rlhf_reward": -14.954801474453184, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 9.596094131469727, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.9781997799873352, "step": 653, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.000553607940674 }, { "episode": 10480, "epoch": 0.18837401588956393, "loss/policy_avg": 0.009640902280807495, "lr": 9.582055214723927e-06, "objective/entropy": -158.4319305419922, "objective/kl": 39.49668502807617, "objective/non_score_reward": -3.9496688842773438, "objective/rlhf_reward": -14.194554600779135, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 5.894138336181641, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7409219741821289, "step": 654, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.000908851623535 }, { "episode": 10496, "epoch": 0.1886616098069526, "loss/policy_avg": 1.1138458251953125, "lr": 9.581416155419224e-06, "objective/entropy": -360.32135009765625, "objective/kl": 37.8345947265625, "objective/non_score_reward": -3.7834596633911133, "objective/rlhf_reward": -13.683239798159942, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 22.492328643798828, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6220200657844543, "step": 655, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9974379539489746 }, { "episode": 10512, "epoch": 0.18894920372434124, "loss/policy_avg": 0.20075130462646484, "lr": 9.58077709611452e-06, "objective/entropy": -133.64358520507812, "objective/kl": 45.42504119873047, "objective/non_score_reward": -4.54250431060791, "objective/rlhf_reward": -15.770017242431642, "objective/scores": 0.6, "policy/approxkl_avg": 11.763971328735352, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.645634651184082, "step": 656, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.999332308769226 }, { "episode": 10528, "epoch": 0.18923679764172988, "loss/policy_avg": -0.052210867404937744, "lr": 9.580138036809816e-06, "objective/entropy": -62.48654556274414, "objective/kl": 38.83781433105469, "objective/non_score_reward": -3.883781671524048, "objective/rlhf_reward": -15.535126209259033, "objective/scores": 0.0, "policy/approxkl_avg": 57.294769287109375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5782778263092041, "step": 657, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9967520236968994 }, { "episode": 10544, "epoch": 0.18952439155911852, "loss/policy_avg": 0.23513492941856384, "lr": 9.579498977505113e-06, "objective/entropy": -327.19793701171875, "objective/kl": 38.20623779296875, "objective/non_score_reward": -3.8206238746643066, "objective/rlhf_reward": -13.858663637836543, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 53.93671417236328, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6590805053710938, "step": 658, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.999345302581787 }, { "episode": 10560, "epoch": 0.18981198547650718, "loss/policy_avg": -0.06993924081325531, "lr": 9.57885991820041e-06, "objective/entropy": -264.03887939453125, "objective/kl": 37.34988784790039, "objective/non_score_reward": -3.7349889278411865, "objective/rlhf_reward": -14.939955472946167, "objective/scores": 0.0, "policy/approxkl_avg": 0.5475348234176636, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5164666175842285, "step": 659, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.000227928161621 }, { "episode": 10576, "epoch": 0.19009957939389582, "loss/policy_avg": 0.4225189685821533, "lr": 9.578220858895705e-06, "objective/entropy": 19.757404327392578, "objective/kl": 28.303627014160156, "objective/non_score_reward": -2.830362558364868, "objective/rlhf_reward": -8.921450233459472, "objective/scores": 0.6, "policy/approxkl_avg": 5.871613502502441, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.5146780014038086, "step": 660, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9982116222381592 }, { "episode": 10592, "epoch": 0.19038717331128446, "loss/policy_avg": 0.08937665820121765, "lr": 9.577581799591002e-06, "objective/entropy": 44.000144958496094, "objective/kl": 42.06709671020508, "objective/non_score_reward": -4.206709861755371, "objective/rlhf_reward": -15.164979224622833, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 4.199661731719971, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5933263301849365, "step": 661, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9983890056610107 }, { "episode": 10608, "epoch": 0.1906747672286731, "loss/policy_avg": 0.3704705834388733, "lr": 9.576942740286299e-06, "objective/entropy": -61.501338958740234, "objective/kl": 31.85788345336914, "objective/non_score_reward": -3.185788154602051, "objective/rlhf_reward": -10.343152618408203, "objective/scores": 0.6, "policy/approxkl_avg": 26.145584106445312, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7544271945953369, "step": 662, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.998536229133606 }, { "episode": 10624, "epoch": 0.19096236114606177, "loss/policy_avg": 1.3078722953796387, "lr": 9.576303680981596e-06, "objective/entropy": 34.678199768066406, "objective/kl": 49.809627532958984, "objective/non_score_reward": -4.980962753295898, "objective/rlhf_reward": -18.523851490020753, "objective/scores": 0.35, "policy/approxkl_avg": 52.354454040527344, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6402466297149658, "step": 663, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9968035221099854 }, { "episode": 10640, "epoch": 0.1912499550634504, "loss/policy_avg": 0.7618193626403809, "lr": 9.575664621676893e-06, "objective/entropy": 29.856212615966797, "objective/kl": 23.130603790283203, "objective/non_score_reward": -2.3130602836608887, "objective/rlhf_reward": -7.892991387580318, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 2.8696446418762207, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.759651243686676, "step": 664, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9994723796844482 }, { "episode": 10656, "epoch": 0.19153754898083905, "loss/policy_avg": 0.08811396360397339, "lr": 9.57502556237219e-06, "objective/entropy": -262.10101318359375, "objective/kl": 41.6727294921875, "objective/non_score_reward": -4.16727352142334, "objective/rlhf_reward": -14.269093132019044, "objective/scores": 0.6, "policy/approxkl_avg": 8.957071304321289, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6705787181854248, "step": 665, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.0116848945617676 }, { "episode": 10672, "epoch": 0.1918251428982277, "loss/policy_avg": 1.2480721473693848, "lr": 9.574386503067485e-06, "objective/entropy": -60.014404296875, "objective/kl": 38.03213119506836, "objective/non_score_reward": -3.803213119506836, "objective/rlhf_reward": -13.47951890627543, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 58.63201904296875, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6512277722358704, "step": 666, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9998128414154053 }, { "episode": 10688, "epoch": 0.19211273681561636, "loss/policy_avg": 0.5962315201759338, "lr": 9.573747443762782e-06, "objective/entropy": 33.21426010131836, "objective/kl": 37.888919830322266, "objective/non_score_reward": -3.7888917922973633, "objective/rlhf_reward": -13.813931754141478, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 41.1786994934082, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8934241533279419, "step": 667, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.00087571144104 }, { "episode": 10704, "epoch": 0.192400330733005, "loss/policy_avg": 1.1563293933868408, "lr": 9.573108384458079e-06, "objective/entropy": -12.93045425415039, "objective/kl": 53.34501647949219, "objective/non_score_reward": -5.334501266479492, "objective/rlhf_reward": -19.73388603693636, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 4.820253372192383, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7993291616439819, "step": 668, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9991778135299683 }, { "episode": 10720, "epoch": 0.19268792465039364, "loss/policy_avg": 0.4342407286167145, "lr": 9.572469325153375e-06, "objective/entropy": 140.0662078857422, "objective/kl": 49.41921615600586, "objective/non_score_reward": -4.941922187805176, "objective/rlhf_reward": -18.034354225794473, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 17.247631072998047, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6062232255935669, "step": 669, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0003952980041504 }, { "episode": 10736, "epoch": 0.19297551856778228, "loss/policy_avg": 0.5148516893386841, "lr": 9.571830265848672e-06, "objective/entropy": 155.82278442382812, "objective/kl": 30.684419631958008, "objective/non_score_reward": -3.068441867828369, "objective/rlhf_reward": -10.326356480793889, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 4.213561534881592, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6449806690216064, "step": 670, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0085437297821045 }, { "episode": 10752, "epoch": 0.19326311248517095, "loss/policy_avg": 0.2710059881210327, "lr": 9.571191206543968e-06, "objective/entropy": -380.40130615234375, "objective/kl": 27.177127838134766, "objective/non_score_reward": -2.717712879180908, "objective/rlhf_reward": -9.355079734119114, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 25.084197998046875, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6170598268508911, "step": 671, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9980518817901611 }, { "episode": 10768, "epoch": 0.19355070640255959, "loss/policy_avg": 0.028690431267023087, "lr": 9.570552147239264e-06, "objective/entropy": 32.0269775390625, "objective/kl": 41.27011489868164, "objective/non_score_reward": -4.127011299133301, "objective/rlhf_reward": -12.108045673370361, "objective/scores": 1.1, "policy/approxkl_avg": 26.040042877197266, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5785200595855713, "step": 672, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9978328943252563 }, { "episode": 10784, "epoch": 0.19383830031994823, "loss/policy_avg": 0.27589982748031616, "lr": 9.569913087934561e-06, "objective/entropy": -252.5802001953125, "objective/kl": 35.50873565673828, "objective/non_score_reward": -3.5508739948272705, "objective/rlhf_reward": -12.877982649832887, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 122.07984924316406, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8820939064025879, "step": 673, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9997135400772095 }, { "episode": 10800, "epoch": 0.1941258942373369, "loss/policy_avg": 1.3692753314971924, "lr": 9.569274028629858e-06, "objective/entropy": -356.3477783203125, "objective/kl": 35.032447814941406, "objective/non_score_reward": -3.5032448768615723, "objective/rlhf_reward": -12.065568755345282, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 7.098365783691406, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6300245523452759, "step": 674, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.0019798278808594 }, { "episode": 10816, "epoch": 0.19441348815472553, "loss/policy_avg": 1.615112543106079, "lr": 9.568634969325155e-06, "objective/entropy": 15.047992706298828, "objective/kl": 31.80112075805664, "objective/non_score_reward": -3.180111885070801, "objective/rlhf_reward": -11.239495160992504, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 16.252872467041016, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4491235613822937, "step": 675, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.999570608139038 }, { "episode": 10832, "epoch": 0.19470108207211417, "loss/policy_avg": -0.033027857542037964, "lr": 9.567995910020452e-06, "objective/entropy": -1.499664306640625, "objective/kl": 41.21196746826172, "objective/non_score_reward": -4.12119722366333, "objective/rlhf_reward": -15.03419003924881, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 2.4209108352661133, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7600862383842468, "step": 676, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9999175071716309 }, { "episode": 10848, "epoch": 0.1949886759895028, "loss/policy_avg": 0.8857518434524536, "lr": 9.567356850715747e-06, "objective/entropy": -144.43075561523438, "objective/kl": 27.657447814941406, "objective/non_score_reward": -2.765744924545288, "objective/rlhf_reward": -9.684377172080378, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 104.82466125488281, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7131980061531067, "step": 677, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0003392696380615 }, { "episode": 10864, "epoch": 0.19527626990689148, "loss/policy_avg": 1.639385461807251, "lr": 9.566717791411044e-06, "objective/entropy": 48.725257873535156, "objective/kl": 38.95005798339844, "objective/non_score_reward": -3.895005702972412, "objective/rlhf_reward": -14.023764221873833, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 42.96604537963867, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8440141081809998, "step": 678, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9974513053894043 }, { "episode": 10880, "epoch": 0.19556386382428012, "loss/policy_avg": 0.4775700569152832, "lr": 9.56607873210634e-06, "objective/entropy": -59.389835357666016, "objective/kl": 39.82011032104492, "objective/non_score_reward": -3.982011318206787, "objective/rlhf_reward": -13.528044319152833, "objective/scores": 0.6, "policy/approxkl_avg": 7.1817216873168945, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5918980836868286, "step": 679, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9976534843444824 }, { "episode": 10896, "epoch": 0.19585145774166876, "loss/policy_avg": 0.2459838092327118, "lr": 9.565439672801636e-06, "objective/entropy": -161.0146484375, "objective/kl": 49.84676742553711, "objective/non_score_reward": -4.984676361083984, "objective/rlhf_reward": -18.27684689086734, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 58.91631317138672, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.611931324005127, "step": 680, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9979312419891357 }, { "episode": 10912, "epoch": 0.1961390516590574, "loss/policy_avg": 0.01540219783782959, "lr": 9.564800613496933e-06, "objective/entropy": -148.67662048339844, "objective/kl": 43.636253356933594, "objective/non_score_reward": -4.363625526428223, "objective/rlhf_reward": -16.075899221984248, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 23.949655532836914, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.730331301689148, "step": 681, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9982978105545044 }, { "episode": 10928, "epoch": 0.19642664557644607, "loss/policy_avg": 0.015965640544891357, "lr": 9.56416155419223e-06, "objective/entropy": -276.25225830078125, "objective/kl": 38.398780822753906, "objective/non_score_reward": -3.8398780822753906, "objective/rlhf_reward": -13.959512329101564, "objective/scores": 0.35, "policy/approxkl_avg": 1.795326590538025, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.776302695274353, "step": 682, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.002492904663086 }, { "episode": 10944, "epoch": 0.1967142394938347, "loss/policy_avg": 2.442565679550171, "lr": 9.563522494887527e-06, "objective/entropy": 58.142906188964844, "objective/kl": 34.78215789794922, "objective/non_score_reward": -3.4782156944274902, "objective/rlhf_reward": -12.489030678470698, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 43.62590026855469, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4453713893890381, "step": 683, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0012001991271973 }, { "episode": 10960, "epoch": 0.19700183341122335, "loss/policy_avg": 0.5201736092567444, "lr": 9.562883435582822e-06, "objective/entropy": 145.70559692382812, "objective/kl": 38.99374771118164, "objective/non_score_reward": -3.8993749618530273, "objective/rlhf_reward": -14.041240303721978, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 8.502677917480469, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6028515100479126, "step": 684, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0018045902252197 }, { "episode": 10976, "epoch": 0.197289427328612, "loss/policy_avg": 0.7856461405754089, "lr": 9.562244376278119e-06, "objective/entropy": 62.90361785888672, "objective/kl": 44.264869689941406, "objective/non_score_reward": -4.426486492156982, "objective/rlhf_reward": -13.305945968627931, "objective/scores": 1.1, "policy/approxkl_avg": 5.448941230773926, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4748787581920624, "step": 685, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0009329319000244 }, { "episode": 10992, "epoch": 0.19757702124600066, "loss/policy_avg": 0.051717519760131836, "lr": 9.561605316973416e-06, "objective/entropy": 18.32666015625, "objective/kl": 49.36090087890625, "objective/non_score_reward": -4.936090469360352, "objective/rlhf_reward": -18.385112726424616, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 26.29433822631836, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.41640418767929077, "step": 686, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9988088607788086 }, { "episode": 11008, "epoch": 0.1978646151633893, "loss/policy_avg": 0.41617196798324585, "lr": 9.560966257668713e-06, "objective/entropy": -196.81280517578125, "objective/kl": 33.58589553833008, "objective/non_score_reward": -3.3585898876190186, "objective/rlhf_reward": -11.77250004333316, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 13.112488746643066, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5552275776863098, "step": 687, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.998762607574463 }, { "episode": 11024, "epoch": 0.19815220908077794, "loss/policy_avg": 1.3362700939178467, "lr": 9.56032719836401e-06, "objective/entropy": -52.72002029418945, "objective/kl": 51.95423889160156, "objective/non_score_reward": -5.195423603057861, "objective/rlhf_reward": -19.456181798010988, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 25.351802825927734, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5189340710639954, "step": 688, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9992269277572632 }, { "episode": 11040, "epoch": 0.19843980299816658, "loss/policy_avg": 0.5213384628295898, "lr": 9.559688139059306e-06, "objective/entropy": -131.5908660888672, "objective/kl": 40.5286865234375, "objective/non_score_reward": -4.052868843078613, "objective/rlhf_reward": -14.264063189701972, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 6.9836015701293945, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7457672357559204, "step": 689, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9976084232330322 }, { "episode": 11056, "epoch": 0.19872739691555524, "loss/policy_avg": 0.3092210590839386, "lr": 9.559049079754601e-06, "objective/entropy": -199.13714599609375, "objective/kl": 43.78075408935547, "objective/non_score_reward": -4.378075122833252, "objective/rlhf_reward": -15.687471742900918, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 9.315277099609375, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5846556425094604, "step": 690, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9987812042236328 }, { "episode": 11072, "epoch": 0.19901499083294388, "loss/policy_avg": 0.19393323361873627, "lr": 9.558410020449898e-06, "objective/entropy": -96.8709716796875, "objective/kl": 47.84043502807617, "objective/non_score_reward": -4.784043312072754, "objective/rlhf_reward": -17.311345453533242, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 25.493038177490234, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6347634792327881, "step": 691, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.000093936920166 }, { "episode": 11088, "epoch": 0.19930258475033252, "loss/policy_avg": 0.37496620416641235, "lr": 9.557770961145195e-06, "objective/entropy": -55.891319274902344, "objective/kl": 49.187400817871094, "objective/non_score_reward": -4.9187397956848145, "objective/rlhf_reward": -17.727547953801093, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 36.81108856201172, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.47560715675354004, "step": 692, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9989240169525146 }, { "episode": 11104, "epoch": 0.1995901786677212, "loss/policy_avg": 0.10556544363498688, "lr": 9.557131901840492e-06, "objective/entropy": 51.882606506347656, "objective/kl": 43.57653045654297, "objective/non_score_reward": -4.357653617858887, "objective/rlhf_reward": -17.43061327934265, "objective/scores": 0.0, "policy/approxkl_avg": 109.87631225585938, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5153118968009949, "step": 693, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.997976541519165 }, { "episode": 11120, "epoch": 0.19987777258510983, "loss/policy_avg": 0.5423169136047363, "lr": 9.556492842535789e-06, "objective/entropy": -157.76409912109375, "objective/kl": 35.16328430175781, "objective/non_score_reward": -3.5163283348083496, "objective/rlhf_reward": -12.54954107979172, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 33.95176696777344, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4616202712059021, "step": 694, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9994275569915771 }, { "episode": 11136, "epoch": 0.20016536650249847, "loss/policy_avg": 0.22838959097862244, "lr": 9.555853783231084e-06, "objective/entropy": -330.77996826171875, "objective/kl": 36.184104919433594, "objective/non_score_reward": -3.618410587310791, "objective/rlhf_reward": -13.049810250003901, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 4.890125274658203, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6670582294464111, "step": 695, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9984445571899414 }, { "episode": 11152, "epoch": 0.2004529604198871, "loss/policy_avg": 0.32133978605270386, "lr": 9.555214723926381e-06, "objective/entropy": -302.5234375, "objective/kl": 42.94685745239258, "objective/non_score_reward": -4.2946858406066895, "objective/rlhf_reward": -15.622484176364495, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 5.337358474731445, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7116554975509644, "step": 696, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.999316692352295 }, { "episode": 11168, "epoch": 0.20074055433727578, "loss/policy_avg": 0.7934830188751221, "lr": 9.554575664621678e-06, "objective/entropy": 174.83148193359375, "objective/kl": 46.936683654785156, "objective/non_score_reward": -4.693668365478516, "objective/rlhf_reward": -17.218413441386772, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 9.96051025390625, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.4638591408729553, "step": 697, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.000460624694824 }, { "episode": 11184, "epoch": 0.20102814825466442, "loss/policy_avg": 1.1395208835601807, "lr": 9.553936605316975e-06, "objective/entropy": -101.63008880615234, "objective/kl": 44.84364318847656, "objective/non_score_reward": -4.4843645095825195, "objective/rlhf_reward": -17.93745756149292, "objective/scores": 0.0, "policy/approxkl_avg": 35.57691192626953, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8001389503479004, "step": 698, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9991685152053833 }, { "episode": 11200, "epoch": 0.20131574217205306, "loss/policy_avg": 0.9136269092559814, "lr": 9.553297546012272e-06, "objective/entropy": 75.33238220214844, "objective/kl": 40.499298095703125, "objective/non_score_reward": -4.049930095672607, "objective/rlhf_reward": -14.466386334101358, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 3.3566324710845947, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5737951993942261, "step": 699, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9991101026535034 }, { "episode": 11216, "epoch": 0.2016033360894417, "loss/policy_avg": 0.15398727357387543, "lr": 9.552658486707569e-06, "objective/entropy": -117.01931762695312, "objective/kl": 45.11551284790039, "objective/non_score_reward": -4.511551856994629, "objective/rlhf_reward": -16.667604305831293, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 4.609723091125488, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5330109596252441, "step": 700, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9992060661315918 }, { "episode": 11232, "epoch": 0.20189093000683037, "loss/policy_avg": 0.9212744235992432, "lr": 9.552019427402864e-06, "objective/entropy": -53.3884391784668, "objective/kl": 37.0269660949707, "objective/non_score_reward": -3.7026968002319336, "objective/rlhf_reward": -13.48527387145154, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 13.100292205810547, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.3935832977294922, "step": 701, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9988267421722412 }, { "episode": 11248, "epoch": 0.202178523924219, "loss/policy_avg": 0.03283894807100296, "lr": 9.55138036809816e-06, "objective/entropy": -55.63243865966797, "objective/kl": 42.750938415527344, "objective/non_score_reward": -4.275094032287598, "objective/rlhf_reward": -14.176657830120298, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 40.57060241699219, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5732653141021729, "step": 702, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9987488985061646 }, { "episode": 11264, "epoch": 0.20246611784160765, "loss/policy_avg": 0.33969682455062866, "lr": 9.550741308793456e-06, "objective/entropy": -148.54368591308594, "objective/kl": 44.687744140625, "objective/non_score_reward": -4.468774795532227, "objective/rlhf_reward": -17.87509775161743, "objective/scores": 0.0, "policy/approxkl_avg": 39.72189712524414, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8945959806442261, "step": 703, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9970632791519165 }, { "episode": 11280, "epoch": 0.20275371175899629, "loss/policy_avg": 1.481357455253601, "lr": 9.550102249488753e-06, "objective/entropy": 104.76945495605469, "objective/kl": 51.556732177734375, "objective/non_score_reward": -5.155673503875732, "objective/rlhf_reward": -19.066434710231377, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 52.006526947021484, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6107035875320435, "step": 704, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9983046054840088 }, { "episode": 11296, "epoch": 0.20304130567638495, "loss/policy_avg": 0.9144167900085449, "lr": 9.54946319018405e-06, "objective/entropy": 123.02681732177734, "objective/kl": 30.473539352416992, "objective/non_score_reward": -3.047353982925415, "objective/rlhf_reward": -10.765583594043818, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 23.10242462158203, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.8819725513458252, "step": 705, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0043578147888184 }, { "episode": 11312, "epoch": 0.2033288995937736, "loss/policy_avg": 0.05131208896636963, "lr": 9.548824130879346e-06, "objective/entropy": 10.1641845703125, "objective/kl": 39.38353729248047, "objective/non_score_reward": -3.938354015350342, "objective/rlhf_reward": -14.427902731925172, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 8.995450019836426, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8428833484649658, "step": 706, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0014190673828125 }, { "episode": 11328, "epoch": 0.20361649351116223, "loss/policy_avg": -0.4441620409488678, "lr": 9.548185071574643e-06, "objective/entropy": -176.51174926757812, "objective/kl": 46.960792541503906, "objective/non_score_reward": -4.696079254150391, "objective/rlhf_reward": -17.360484202106562, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 0.7873663306236267, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7117265462875366, "step": 707, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.002187490463257 }, { "episode": 11344, "epoch": 0.20390408742855087, "loss/policy_avg": 1.6518831253051758, "lr": 9.547546012269938e-06, "objective/entropy": -109.38287353515625, "objective/kl": 40.430946350097656, "objective/non_score_reward": -4.043094635009766, "objective/rlhf_reward": -13.772377824783327, "objective/scores": 0.6, "policy/approxkl_avg": 37.53660583496094, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5296775102615356, "step": 708, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9980195760726929 }, { "episode": 11360, "epoch": 0.20419168134593954, "loss/policy_avg": 0.0021621547639369965, "lr": 9.546906952965235e-06, "objective/entropy": -221.2525177001953, "objective/kl": 41.13963317871094, "objective/non_score_reward": -4.113963603973389, "objective/rlhf_reward": -14.722521559397379, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 14.289579391479492, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5872384309768677, "step": 709, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0001235008239746 }, { "episode": 11376, "epoch": 0.20447927526332818, "loss/policy_avg": -0.4720730483531952, "lr": 9.546267893660532e-06, "objective/entropy": -227.78741455078125, "objective/kl": 31.431976318359375, "objective/non_score_reward": -3.143197774887085, "objective/rlhf_reward": -10.910931830824005, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 5.8426594734191895, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.8071937561035156, "step": 710, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0169754028320312 }, { "episode": 11392, "epoch": 0.20476686918071682, "loss/policy_avg": 1.3163057565689087, "lr": 9.545628834355829e-06, "objective/entropy": 129.99642944335938, "objective/kl": 35.20526885986328, "objective/non_score_reward": -3.5205271244049072, "objective/rlhf_reward": -14.08210825920105, "objective/scores": 0.0, "policy/approxkl_avg": 16.019872665405273, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.49222731590270996, "step": 711, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.999373435974121 }, { "episode": 11408, "epoch": 0.2050544630981055, "loss/policy_avg": 0.7439494132995605, "lr": 9.544989775051126e-06, "objective/entropy": -157.7192840576172, "objective/kl": 38.970420837402344, "objective/non_score_reward": -3.8970417976379395, "objective/rlhf_reward": -14.107215288098217, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 12.206151962280273, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5406340956687927, "step": 712, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.999123215675354 }, { "episode": 11424, "epoch": 0.20534205701549413, "loss/policy_avg": -0.07283779978752136, "lr": 9.544350715746423e-06, "objective/entropy": -228.50950622558594, "objective/kl": 39.152793884277344, "objective/non_score_reward": -3.9152798652648926, "objective/rlhf_reward": -14.3194828539187, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 37.80413818359375, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6945517659187317, "step": 713, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.001655101776123 }, { "episode": 11440, "epoch": 0.20562965093288277, "loss/policy_avg": -0.2732037901878357, "lr": 9.543711656441718e-06, "objective/entropy": -12.787384033203125, "objective/kl": 35.518089294433594, "objective/non_score_reward": -3.5518088340759277, "objective/rlhf_reward": -14.207236051559448, "objective/scores": 0.0, "policy/approxkl_avg": 8.075851440429688, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.48810243606567383, "step": 714, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0166308879852295 }, { "episode": 11456, "epoch": 0.2059172448502714, "loss/policy_avg": 0.43229636549949646, "lr": 9.543072597137015e-06, "objective/entropy": -66.50861358642578, "objective/kl": 45.58790969848633, "objective/non_score_reward": -4.558791160583496, "objective/rlhf_reward": -15.311445151210997, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 3.3843135833740234, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6485980749130249, "step": 715, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 2.0008926391601562 }, { "episode": 11472, "epoch": 0.20620483876766008, "loss/policy_avg": 0.8147934079170227, "lr": 9.542433537832312e-06, "objective/entropy": -161.17880249023438, "objective/kl": 46.83830261230469, "objective/non_score_reward": -4.683830738067627, "objective/rlhf_reward": -16.612616243139776, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 18.456661224365234, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5038250684738159, "step": 716, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9985175132751465 }, { "episode": 11488, "epoch": 0.20649243268504872, "loss/policy_avg": 0.2672369182109833, "lr": 9.541794478527609e-06, "objective/entropy": -305.15264892578125, "objective/kl": 38.918190002441406, "objective/non_score_reward": -3.8918187618255615, "objective/rlhf_reward": -13.90541625541507, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 23.703933715820312, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6772634983062744, "step": 717, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9990991353988647 }, { "episode": 11504, "epoch": 0.20678002660243736, "loss/policy_avg": 0.6788185238838196, "lr": 9.541155419222906e-06, "objective/entropy": -22.62152862548828, "objective/kl": 48.63094711303711, "objective/non_score_reward": -4.863094329833984, "objective/rlhf_reward": -18.052378273010255, "objective/scores": 0.35, "policy/approxkl_avg": 66.53858184814453, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5422554016113281, "step": 718, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9975666999816895 }, { "episode": 11520, "epoch": 0.207067620519826, "loss/policy_avg": 1.2419401407241821, "lr": 9.5405163599182e-06, "objective/entropy": -44.057044982910156, "objective/kl": 47.82793426513672, "objective/non_score_reward": -4.7827935218811035, "objective/rlhf_reward": -17.70734198828515, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 63.91344451904297, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4672902226448059, "step": 719, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9976657629013062 }, { "episode": 11536, "epoch": 0.20735521443721466, "loss/policy_avg": 0.23812846839427948, "lr": 9.539877300613498e-06, "objective/entropy": 14.730127334594727, "objective/kl": 36.89497375488281, "objective/non_score_reward": -3.689497470855713, "objective/rlhf_reward": -13.15386942392977, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 64.03750610351562, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5559313297271729, "step": 720, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9973509311676025 }, { "episode": 11552, "epoch": 0.2076428083546033, "loss/policy_avg": 2.798316478729248, "lr": 9.539238241308795e-06, "objective/entropy": -49.72710418701172, "objective/kl": 45.801639556884766, "objective/non_score_reward": -4.580163955688477, "objective/rlhf_reward": -16.920656538009645, "objective/scores": 0.35, "policy/approxkl_avg": 15.340538024902344, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6497080326080322, "step": 721, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9989633560180664 }, { "episode": 11568, "epoch": 0.20793040227199194, "loss/policy_avg": 0.8752083778381348, "lr": 9.538599182004091e-06, "objective/entropy": -125.80049133300781, "objective/kl": 32.64875793457031, "objective/non_score_reward": -3.264875888824463, "objective/rlhf_reward": -11.733989987403078, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 67.2957534790039, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5858502984046936, "step": 722, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.998004674911499 }, { "episode": 11584, "epoch": 0.20821799618938058, "loss/policy_avg": -0.8605256080627441, "lr": 9.537960122699387e-06, "objective/entropy": 32.59730529785156, "objective/kl": 46.67155075073242, "objective/non_score_reward": -4.667154788970947, "objective/rlhf_reward": -18.66861915588379, "objective/scores": 0.0, "policy/approxkl_avg": 139.18896484375, "policy/clipfrac_avg": 2.0, "policy/entropy_avg": 0.39150160551071167, "step": 723, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0014235973358154 }, { "episode": 11600, "epoch": 0.20850559010676925, "loss/policy_avg": 1.036819577217102, "lr": 9.537321063394683e-06, "objective/entropy": -182.33963012695312, "objective/kl": 40.85021209716797, "objective/non_score_reward": -4.085021018981934, "objective/rlhf_reward": -14.99844866087976, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 7.50022554397583, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5750303268432617, "step": 724, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9982917308807373 }, { "episode": 11616, "epoch": 0.2087931840241579, "loss/policy_avg": 0.3512716591358185, "lr": 9.53668200408998e-06, "objective/entropy": -271.77294921875, "objective/kl": 46.54193878173828, "objective/non_score_reward": -4.654193878173828, "objective/rlhf_reward": -15.693056498409483, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 35.207305908203125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5233364105224609, "step": 725, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9976844787597656 }, { "episode": 11632, "epoch": 0.20908077794154653, "loss/policy_avg": -0.24283993244171143, "lr": 9.536042944785277e-06, "objective/entropy": 59.02741241455078, "objective/kl": 31.218732833862305, "objective/non_score_reward": -3.121873378753662, "objective/rlhf_reward": -9.56377426231024, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 13.003084182739258, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.9299765825271606, "step": 726, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0005626678466797 }, { "episode": 11648, "epoch": 0.20936837185893517, "loss/policy_avg": 1.312835931777954, "lr": 9.535403885480572e-06, "objective/entropy": -175.0522918701172, "objective/kl": 53.506343841552734, "objective/non_score_reward": -5.3506340980529785, "objective/rlhf_reward": -19.95193825206314, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 26.03130340576172, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7993600368499756, "step": 727, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9979145526885986 }, { "episode": 11664, "epoch": 0.20965596577632384, "loss/policy_avg": 2.228332042694092, "lr": 9.53476482617587e-06, "objective/entropy": -13.038238525390625, "objective/kl": 29.668231964111328, "objective/non_score_reward": -2.9668235778808594, "objective/rlhf_reward": -8.943575863481733, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 58.691162109375, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5734894275665283, "step": 728, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9984681606292725 }, { "episode": 11680, "epoch": 0.20994355969371248, "loss/policy_avg": 1.2234901189804077, "lr": 9.534125766871166e-06, "objective/entropy": -52.32987594604492, "objective/kl": 42.713897705078125, "objective/non_score_reward": -4.271389961242676, "objective/rlhf_reward": -15.481440577570517, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 105.13751220703125, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6829936504364014, "step": 729, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9984365701675415 }, { "episode": 11696, "epoch": 0.21023115361110112, "loss/policy_avg": 0.6054737567901611, "lr": 9.533486707566463e-06, "objective/entropy": -4.811004638671875, "objective/kl": 38.744468688964844, "objective/non_score_reward": -3.8744468688964844, "objective/rlhf_reward": -13.893667016092856, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 13.6554536819458, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5291285514831543, "step": 730, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9962334632873535 }, { "episode": 11712, "epoch": 0.21051874752848979, "loss/policy_avg": 0.2636476159095764, "lr": 9.53284764826176e-06, "objective/entropy": -290.16064453125, "objective/kl": 34.78122329711914, "objective/non_score_reward": -3.4781219959259033, "objective/rlhf_reward": -12.431535365994336, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 5.398682594299316, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.40512436628341675, "step": 731, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9987027645111084 }, { "episode": 11728, "epoch": 0.21080634144587843, "loss/policy_avg": 0.08485618978738785, "lr": 9.532208588957055e-06, "objective/entropy": 4.361198425292969, "objective/kl": 48.85911178588867, "objective/non_score_reward": -4.88591194152832, "objective/rlhf_reward": -18.06269371789253, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 24.168426513671875, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7394464015960693, "step": 732, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.998630166053772 }, { "episode": 11744, "epoch": 0.21109393536326707, "loss/policy_avg": 0.6081631183624268, "lr": 9.531569529652352e-06, "objective/entropy": -193.95896911621094, "objective/kl": 41.25489044189453, "objective/non_score_reward": -4.125488758087158, "objective/rlhf_reward": -14.101954555511476, "objective/scores": 0.6, "policy/approxkl_avg": 10.96660041809082, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4481828808784485, "step": 733, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9984350204467773 }, { "episode": 11760, "epoch": 0.2113815292806557, "loss/policy_avg": 0.1706855297088623, "lr": 9.530930470347649e-06, "objective/entropy": -283.3249816894531, "objective/kl": 36.63468933105469, "objective/non_score_reward": -3.663468599319458, "objective/rlhf_reward": -13.275272228804928, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 1.8939387798309326, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6289054155349731, "step": 734, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9998714923858643 }, { "episode": 11776, "epoch": 0.21166912319804437, "loss/policy_avg": 0.006064563989639282, "lr": 9.530291411042946e-06, "objective/entropy": -281.0845947265625, "objective/kl": 40.41436767578125, "objective/non_score_reward": -4.041437149047852, "objective/rlhf_reward": -14.340918655666421, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 4.309089660644531, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.9609556794166565, "step": 735, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 2.001737594604492 }, { "episode": 11792, "epoch": 0.211956717115433, "loss/policy_avg": 0.9804132580757141, "lr": 9.529652351738243e-06, "objective/entropy": -149.67555236816406, "objective/kl": 33.81299591064453, "objective/non_score_reward": -3.3812994956970215, "objective/rlhf_reward": -12.009426915439303, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 18.63791275024414, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8183978199958801, "step": 736, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9984934329986572 }, { "episode": 11808, "epoch": 0.21224431103282165, "loss/policy_avg": 0.8149501085281372, "lr": 9.52901329243354e-06, "objective/entropy": -135.65036010742188, "objective/kl": 38.22229766845703, "objective/non_score_reward": -3.8222296237945557, "objective/rlhf_reward": -13.94728236487451, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 56.22487258911133, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.49448803067207336, "step": 737, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000056743621826 }, { "episode": 11824, "epoch": 0.2125319049502103, "loss/policy_avg": 0.9543494582176208, "lr": 9.528374233128835e-06, "objective/entropy": -111.98321533203125, "objective/kl": 40.26172637939453, "objective/non_score_reward": -4.026172637939453, "objective/rlhf_reward": -14.680858810146418, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 11.283975601196289, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5840786695480347, "step": 738, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9993596076965332 }, { "episode": 11840, "epoch": 0.21281949886759896, "loss/policy_avg": -0.20723594725131989, "lr": 9.527735173824132e-06, "objective/entropy": 9.0609130859375, "objective/kl": 34.74554443359375, "objective/non_score_reward": -3.4745540618896484, "objective/rlhf_reward": -12.447618345828399, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 8.519084930419922, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7199011445045471, "step": 739, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.001832962036133 }, { "episode": 11856, "epoch": 0.2131070927849876, "loss/policy_avg": -0.20900213718414307, "lr": 9.527096114519428e-06, "objective/entropy": -103.92984008789062, "objective/kl": 39.744930267333984, "objective/non_score_reward": -3.9744927883148193, "objective/rlhf_reward": -14.447373251529083, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 32.00751495361328, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.520116925239563, "step": 740, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.0052452087402344 }, { "episode": 11872, "epoch": 0.21339468670237624, "loss/policy_avg": 0.3564949631690979, "lr": 9.526457055214725e-06, "objective/entropy": -143.9356689453125, "objective/kl": 39.302146911621094, "objective/non_score_reward": -3.9302148818969727, "objective/rlhf_reward": -13.896030183109353, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 41.78466033935547, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.752386748790741, "step": 741, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0004146099090576 }, { "episode": 11888, "epoch": 0.21368228061976488, "loss/policy_avg": 0.565791130065918, "lr": 9.525817995910022e-06, "objective/entropy": -147.33612060546875, "objective/kl": 42.210853576660156, "objective/non_score_reward": -4.221085548400879, "objective/rlhf_reward": -15.222481732786285, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 32.17230224609375, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7177502512931824, "step": 742, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9988508224487305 }, { "episode": 11904, "epoch": 0.21396987453715355, "loss/policy_avg": 0.9356632232666016, "lr": 9.525178936605317e-06, "objective/entropy": -138.57948303222656, "objective/kl": 25.448646545410156, "objective/non_score_reward": -2.5448646545410156, "objective/rlhf_reward": -8.800856449691159, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 29.832378387451172, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8654073476791382, "step": 743, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.99726140499115 }, { "episode": 11920, "epoch": 0.2142574684545422, "loss/policy_avg": 1.411858081817627, "lr": 9.524539877300614e-06, "objective/entropy": -45.33397674560547, "objective/kl": 47.14906311035156, "objective/non_score_reward": -4.714906692504883, "objective/rlhf_reward": -14.459626293182374, "objective/scores": 1.1, "policy/approxkl_avg": 48.37995529174805, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4117211699485779, "step": 744, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9974520206451416 }, { "episode": 11936, "epoch": 0.21454506237193083, "loss/policy_avg": 0.2626647353172302, "lr": 9.52390081799591e-06, "objective/entropy": -90.43316650390625, "objective/kl": 45.553260803222656, "objective/non_score_reward": -4.555326461791992, "objective/rlhf_reward": -15.297585879207823, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 4.135089874267578, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5962470173835754, "step": 745, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000563383102417 }, { "episode": 11952, "epoch": 0.21483265628931947, "loss/policy_avg": 0.709905743598938, "lr": 9.523261758691206e-06, "objective/entropy": -42.35087203979492, "objective/kl": 41.68510055541992, "objective/non_score_reward": -4.168510437011719, "objective/rlhf_reward": -12.274040555953981, "objective/scores": 1.1, "policy/approxkl_avg": 26.280433654785156, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8072315454483032, "step": 746, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9989625215530396 }, { "episode": 11968, "epoch": 0.21512025020670814, "loss/policy_avg": -0.6861017942428589, "lr": 9.522622699386503e-06, "objective/entropy": 90.19775390625, "objective/kl": 53.37855911254883, "objective/non_score_reward": -5.337856292724609, "objective/rlhf_reward": -19.97282347926269, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 2.6925954818725586, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.4838978350162506, "step": 747, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0017528533935547 }, { "episode": 11984, "epoch": 0.21540784412409678, "loss/policy_avg": 1.1363091468811035, "lr": 9.5219836400818e-06, "objective/entropy": -8.687484741210938, "objective/kl": 51.46209716796875, "objective/non_score_reward": -5.146209716796875, "objective/rlhf_reward": -19.22558971617071, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 25.820655822753906, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8138981461524963, "step": 748, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9985830783843994 }, { "episode": 12000, "epoch": 0.21569543804148542, "loss/policy_avg": 0.04534798115491867, "lr": 9.521344580777097e-06, "objective/entropy": -280.7192687988281, "objective/kl": 37.07057189941406, "objective/non_score_reward": -3.7070577144622803, "objective/rlhf_reward": -13.404398520191279, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 47.663856506347656, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.9239341020584106, "step": 749, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9996497631072998 }, { "episode": 12016, "epoch": 0.21598303195887408, "loss/policy_avg": 0.0637999027967453, "lr": 9.520705521472394e-06, "objective/entropy": 149.28018188476562, "objective/kl": 54.167938232421875, "objective/non_score_reward": -5.4167938232421875, "objective/rlhf_reward": -20.307925426696222, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 45.464962005615234, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7254760265350342, "step": 750, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9995579719543457 }, { "episode": 12032, "epoch": 0.21627062587626272, "loss/policy_avg": 1.039564609527588, "lr": 9.520066462167689e-06, "objective/entropy": -83.41825866699219, "objective/kl": 43.81858825683594, "objective/non_score_reward": -4.381858825683594, "objective/rlhf_reward": -16.011662566455538, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 29.998737335205078, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6821025609970093, "step": 751, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9987900257110596 }, { "episode": 12048, "epoch": 0.21655821979365136, "loss/policy_avg": 0.37849336862564087, "lr": 9.519427402862986e-06, "objective/entropy": 14.265073776245117, "objective/kl": 56.944114685058594, "objective/non_score_reward": -5.694411277770996, "objective/rlhf_reward": -20.830234358982977, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 2.6588432788848877, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6893448233604431, "step": 752, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.000504493713379 }, { "episode": 12064, "epoch": 0.21684581371104, "loss/policy_avg": 2.0812063217163086, "lr": 9.518788343558283e-06, "objective/entropy": 12.610939025878906, "objective/kl": 42.907318115234375, "objective/non_score_reward": -4.290732383728027, "objective/rlhf_reward": -15.837415728598756, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 65.95945739746094, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.38551682233810425, "step": 753, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.998692274093628 }, { "episode": 12080, "epoch": 0.21713340762842867, "loss/policy_avg": 1.4952073097229004, "lr": 9.51814928425358e-06, "objective/entropy": 51.34489440917969, "objective/kl": 35.404083251953125, "objective/non_score_reward": -3.540408134460449, "objective/rlhf_reward": -12.783030369368891, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 9.171011924743652, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8167620301246643, "step": 754, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.997989535331726 }, { "episode": 12096, "epoch": 0.2174210015458173, "loss/policy_avg": 0.1273561418056488, "lr": 9.517510224948877e-06, "objective/entropy": -277.29986572265625, "objective/kl": 29.701295852661133, "objective/non_score_reward": -2.9701294898986816, "objective/rlhf_reward": -10.538882306128173, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 26.458465576171875, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5248850584030151, "step": 755, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9987549781799316 }, { "episode": 12112, "epoch": 0.21770859546320595, "loss/policy_avg": 0.4003949761390686, "lr": 9.516871165644172e-06, "objective/entropy": -68.40557861328125, "objective/kl": 39.301795959472656, "objective/non_score_reward": -3.9301795959472656, "objective/rlhf_reward": -14.320718860626222, "objective/scores": 0.35, "policy/approxkl_avg": 2.6001267433166504, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.5238065719604492, "step": 756, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9997713565826416 }, { "episode": 12128, "epoch": 0.2179961893805946, "loss/policy_avg": 0.18117927014827728, "lr": 9.516232106339469e-06, "objective/entropy": -258.4045715332031, "objective/kl": 43.05773162841797, "objective/non_score_reward": -4.3057732582092285, "objective/rlhf_reward": -15.707321727069552, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 51.23970031738281, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4997139275074005, "step": 757, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0000267028808594 }, { "episode": 12144, "epoch": 0.21828378329798326, "loss/policy_avg": 0.421768456697464, "lr": 9.515593047034765e-06, "objective/entropy": -100.52609252929688, "objective/kl": 46.79566192626953, "objective/non_score_reward": -4.679566383361816, "objective/rlhf_reward": -16.89343583134086, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 170.1147003173828, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.3974299132823944, "step": 758, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.999988317489624 }, { "episode": 12160, "epoch": 0.2185713772153719, "loss/policy_avg": 0.6913712024688721, "lr": 9.514953987730062e-06, "objective/entropy": 156.790283203125, "objective/kl": 52.020015716552734, "objective/non_score_reward": -5.202001571655273, "objective/rlhf_reward": -17.884287272335264, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 12.406476974487305, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6548702716827393, "step": 759, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.998046636581421 }, { "episode": 12176, "epoch": 0.21885897113276054, "loss/policy_avg": 1.3135895729064941, "lr": 9.51431492842536e-06, "objective/entropy": -228.68115234375, "objective/kl": 26.70990562438965, "objective/non_score_reward": -2.6709907054901123, "objective/rlhf_reward": -9.342326930075316, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 11.945259094238281, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6857779026031494, "step": 760, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0025386810302734 }, { "episode": 12192, "epoch": 0.21914656505014918, "loss/policy_avg": 2.0080392360687256, "lr": 9.513675869120656e-06, "objective/entropy": -90.80921936035156, "objective/kl": 38.32233428955078, "objective/non_score_reward": -3.832233190536499, "objective/rlhf_reward": -13.969682180617731, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 11.5086669921875, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6320334076881409, "step": 761, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.999190330505371 }, { "episode": 12208, "epoch": 0.21943415896753785, "loss/policy_avg": -0.11610303819179535, "lr": 9.513036809815951e-06, "objective/entropy": -58.6525764465332, "objective/kl": 38.49602508544922, "objective/non_score_reward": -3.849602222442627, "objective/rlhf_reward": -14.056773713141112, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 1.3055446147918701, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4994346499443054, "step": 762, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9999125003814697 }, { "episode": 12224, "epoch": 0.21972175288492649, "loss/policy_avg": 1.3697398900985718, "lr": 9.512397750511248e-06, "objective/entropy": -144.068359375, "objective/kl": 47.98112106323242, "objective/non_score_reward": -4.798112392425537, "objective/rlhf_reward": -17.069743337408575, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 17.99888801574707, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6316829919815063, "step": 763, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.998345971107483 }, { "episode": 12240, "epoch": 0.22000934680231513, "loss/policy_avg": 0.1900498867034912, "lr": 9.511758691206545e-06, "objective/entropy": 36.60948181152344, "objective/kl": 38.48204803466797, "objective/non_score_reward": -3.8482046127319336, "objective/rlhf_reward": -10.99281940460205, "objective/scores": 1.1, "policy/approxkl_avg": 25.719863891601562, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7167200446128845, "step": 764, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.998880386352539 }, { "episode": 12256, "epoch": 0.22029694071970377, "loss/policy_avg": 0.38484030961990356, "lr": 9.511119631901842e-06, "objective/entropy": -45.415122985839844, "objective/kl": 43.59566879272461, "objective/non_score_reward": -4.359567165374756, "objective/rlhf_reward": -16.11275533202283, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 4.580883026123047, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4097760319709778, "step": 765, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9987237453460693 }, { "episode": 12272, "epoch": 0.22058453463709243, "loss/policy_avg": 0.376749187707901, "lr": 9.510480572597139e-06, "objective/entropy": -264.5468444824219, "objective/kl": 40.1629638671875, "objective/non_score_reward": -4.01629638671875, "objective/rlhf_reward": -14.66518578529358, "objective/scores": 0.35, "policy/approxkl_avg": 10.952293395996094, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.788668155670166, "step": 766, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9980183839797974 }, { "episode": 12288, "epoch": 0.22087212855448107, "loss/policy_avg": 0.8714499473571777, "lr": 9.509841513292434e-06, "objective/entropy": -187.86923217773438, "objective/kl": 65.75520324707031, "objective/non_score_reward": -6.5755205154418945, "objective/rlhf_reward": -24.942833626006525, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 13.150325775146484, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5890084505081177, "step": 767, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9972370862960815 }, { "episode": 12304, "epoch": 0.2211597224718697, "loss/policy_avg": 0.38202425837516785, "lr": 9.509202453987731e-06, "objective/entropy": -204.2275848388672, "objective/kl": 45.88398742675781, "objective/non_score_reward": -4.5883989334106445, "objective/rlhf_reward": -15.953595256805421, "objective/scores": 0.6, "policy/approxkl_avg": 98.07376098632812, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7801451086997986, "step": 768, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9973288774490356 }, { "episode": 12320, "epoch": 0.22144731638925838, "loss/policy_avg": 0.6291148066520691, "lr": 9.508563394683026e-06, "objective/entropy": -30.971599578857422, "objective/kl": 47.970767974853516, "objective/non_score_reward": -4.797077178955078, "objective/rlhf_reward": -17.672535979541475, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 19.533828735351562, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6176853775978088, "step": 769, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9993345737457275 }, { "episode": 12336, "epoch": 0.22173491030664702, "loss/policy_avg": 0.48620525002479553, "lr": 9.507924335378323e-06, "objective/entropy": -275.8260803222656, "objective/kl": 35.916908264160156, "objective/non_score_reward": -3.591691017150879, "objective/rlhf_reward": -12.76264480120333, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 11.286559104919434, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6079502105712891, "step": 770, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9984924793243408 }, { "episode": 12352, "epoch": 0.22202250422403566, "loss/policy_avg": 1.610249638557434, "lr": 9.50728527607362e-06, "objective/entropy": 54.72114562988281, "objective/kl": 56.84210205078125, "objective/non_score_reward": -5.684210300445557, "objective/rlhf_reward": -21.41132882598035, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 37.109683990478516, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6080986857414246, "step": 771, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.999401569366455 }, { "episode": 12368, "epoch": 0.2223100981414243, "loss/policy_avg": -0.243692547082901, "lr": 9.506646216768917e-06, "objective/entropy": -64.7589340209961, "objective/kl": 49.16008758544922, "objective/non_score_reward": -4.916008949279785, "objective/rlhf_reward": -18.285433867064814, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 144.7684326171875, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5061931610107422, "step": 772, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9989681243896484 }, { "episode": 12384, "epoch": 0.22259769205881297, "loss/policy_avg": 0.5371442437171936, "lr": 9.506007157464214e-06, "objective/entropy": -145.7498779296875, "objective/kl": 36.19651412963867, "objective/non_score_reward": -3.6196513175964355, "objective/rlhf_reward": -12.962834203036959, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 8.433561325073242, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7705793380737305, "step": 773, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.997005581855774 }, { "episode": 12400, "epoch": 0.2228852859762016, "loss/policy_avg": 0.8214170336723328, "lr": 9.50536809815951e-06, "objective/entropy": -230.78439331054688, "objective/kl": 45.08775329589844, "objective/non_score_reward": -4.50877571105957, "objective/rlhf_reward": -16.210273618969033, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 45.08744812011719, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6854566335678101, "step": 774, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9984679222106934 }, { "episode": 12416, "epoch": 0.22317287989359025, "loss/policy_avg": 0.32185256481170654, "lr": 9.504729038854806e-06, "objective/entropy": 17.611534118652344, "objective/kl": 39.12417984008789, "objective/non_score_reward": -3.9124178886413574, "objective/rlhf_reward": -13.824842984947274, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 23.315509796142578, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.719410240650177, "step": 775, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9994924068450928 }, { "episode": 12432, "epoch": 0.2234604738109789, "loss/policy_avg": 0.8856257200241089, "lr": 9.504089979550103e-06, "objective/entropy": -192.63497924804688, "objective/kl": 46.621360778808594, "objective/non_score_reward": -4.662136077880859, "objective/rlhf_reward": -18.648544788360596, "objective/scores": 0.0, "policy/approxkl_avg": 7.01698637008667, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.47388628125190735, "step": 776, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9977457523345947 }, { "episode": 12448, "epoch": 0.22374806772836756, "loss/policy_avg": -0.10767285525798798, "lr": 9.5034509202454e-06, "objective/entropy": -169.90748596191406, "objective/kl": 40.091983795166016, "objective/non_score_reward": -4.009198188781738, "objective/rlhf_reward": -14.432673726145346, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 2.653290271759033, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4810905158519745, "step": 777, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0010921955108643 }, { "episode": 12464, "epoch": 0.2240356616457562, "loss/policy_avg": 0.5116205215454102, "lr": 9.502811860940696e-06, "objective/entropy": 17.38312530517578, "objective/kl": 46.06462097167969, "objective/non_score_reward": -4.606462478637695, "objective/rlhf_reward": -17.066599333022516, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 26.975656509399414, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.3496581017971039, "step": 778, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9981449842453003 }, { "episode": 12480, "epoch": 0.22432325556314484, "loss/policy_avg": -0.053467996418476105, "lr": 9.502172801635993e-06, "objective/entropy": -109.83203125, "objective/kl": 53.02067565917969, "objective/non_score_reward": -5.302067756652832, "objective/rlhf_reward": -19.88275865080945, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 2.783937454223633, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5046157836914062, "step": 779, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.00018310546875 }, { "episode": 12496, "epoch": 0.22461084948053348, "loss/policy_avg": 0.20323413610458374, "lr": 9.50153374233129e-06, "objective/entropy": -75.36346435546875, "objective/kl": 52.81346893310547, "objective/non_score_reward": -5.281346797943115, "objective/rlhf_reward": -16.725387191772462, "objective/scores": 1.1, "policy/approxkl_avg": 22.060535430908203, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.367484986782074, "step": 780, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0009407997131348 }, { "episode": 12512, "epoch": 0.22489844339792214, "loss/policy_avg": 0.4564368724822998, "lr": 9.500894683026585e-06, "objective/entropy": -37.82079315185547, "objective/kl": 40.027137756347656, "objective/non_score_reward": -4.002713203430176, "objective/rlhf_reward": -13.61085424423218, "objective/scores": 0.6, "policy/approxkl_avg": 1.6412748098373413, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6870338320732117, "step": 781, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.000164270401001 }, { "episode": 12528, "epoch": 0.22518603731531078, "loss/policy_avg": 0.059905484318733215, "lr": 9.500255623721882e-06, "objective/entropy": -47.8739013671875, "objective/kl": 48.443641662597656, "objective/non_score_reward": -4.844364166259766, "objective/rlhf_reward": -17.430046866612372, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 14.049509048461914, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6184900999069214, "step": 782, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0001115798950195 }, { "episode": 12544, "epoch": 0.22547363123269942, "loss/policy_avg": -0.06788864731788635, "lr": 9.499616564417179e-06, "objective/entropy": -196.17083740234375, "objective/kl": 36.355308532714844, "objective/non_score_reward": -3.635531187057495, "objective/rlhf_reward": -13.216611895590944, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 22.939311981201172, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6597442626953125, "step": 783, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9999268054962158 }, { "episode": 12560, "epoch": 0.22576122515008806, "loss/policy_avg": 0.66241455078125, "lr": 9.498977505112476e-06, "objective/entropy": -202.2138671875, "objective/kl": 51.065711975097656, "objective/non_score_reward": -5.106571197509766, "objective/rlhf_reward": -19.002451498706904, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 1.2135827541351318, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4113037586212158, "step": 784, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.001005172729492 }, { "episode": 12576, "epoch": 0.22604881906747673, "loss/policy_avg": 1.2668356895446777, "lr": 9.498338445807773e-06, "objective/entropy": 59.70518493652344, "objective/kl": 40.02257537841797, "objective/non_score_reward": -4.002257823944092, "objective/rlhf_reward": -14.585199673374262, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 21.518054962158203, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.600081205368042, "step": 785, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.000067949295044 }, { "episode": 12592, "epoch": 0.22633641298486537, "loss/policy_avg": 0.017837971448898315, "lr": 9.497699386503068e-06, "objective/entropy": -106.43355560302734, "objective/kl": 45.42462921142578, "objective/non_score_reward": -4.542463302612305, "objective/rlhf_reward": -16.828217556982665, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 1.8331397771835327, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4324771463871002, "step": 786, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0006937980651855 }, { "episode": 12608, "epoch": 0.226624006902254, "loss/policy_avg": 1.1101531982421875, "lr": 9.497060327198365e-06, "objective/entropy": -100.28545379638672, "objective/kl": 38.377403259277344, "objective/non_score_reward": -3.837740898132324, "objective/rlhf_reward": -14.009327462225585, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 21.72365951538086, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.8042199611663818, "step": 787, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9995393753051758 }, { "episode": 12624, "epoch": 0.22691160081964268, "loss/policy_avg": 0.9154256582260132, "lr": 9.496421267893662e-06, "objective/entropy": -13.352066040039062, "objective/kl": 41.567588806152344, "objective/non_score_reward": -4.156759262084961, "objective/rlhf_reward": -15.176439146609649, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 3.859703540802002, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7200378775596619, "step": 788, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9981780052185059 }, { "episode": 12640, "epoch": 0.22719919473703132, "loss/policy_avg": 4.065809726715088, "lr": 9.495782208588959e-06, "objective/entropy": -88.02486419677734, "objective/kl": 41.876869201660156, "objective/non_score_reward": -4.187687397003174, "objective/rlhf_reward": -15.409113696127562, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 10.830172538757324, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.42192888259887695, "step": 789, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.001798152923584 }, { "episode": 12656, "epoch": 0.22748678865441996, "loss/policy_avg": 1.4616761207580566, "lr": 9.495143149284254e-06, "objective/entropy": 12.189279556274414, "objective/kl": 44.4468994140625, "objective/non_score_reward": -4.444689750671387, "objective/rlhf_reward": -16.453245434790773, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 83.95680236816406, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7362450957298279, "step": 790, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9999191761016846 }, { "episode": 12672, "epoch": 0.2277743825718086, "loss/policy_avg": 0.05288725346326828, "lr": 9.49450408997955e-06, "objective/entropy": 31.454437255859375, "objective/kl": 40.4713020324707, "objective/non_score_reward": -4.047130107879639, "objective/rlhf_reward": -13.264801417232725, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 4.5849409103393555, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.9175825119018555, "step": 791, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.000244140625 }, { "episode": 12688, "epoch": 0.22806197648919727, "loss/policy_avg": 0.20884034037590027, "lr": 9.493865030674848e-06, "objective/entropy": 129.760986328125, "objective/kl": 42.595794677734375, "objective/non_score_reward": -4.259579658508301, "objective/rlhf_reward": -15.30498506228129, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 18.363981246948242, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.688461184501648, "step": 792, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.999577522277832 }, { "episode": 12704, "epoch": 0.2283495704065859, "loss/policy_avg": 0.10811804234981537, "lr": 9.493225971370144e-06, "objective/entropy": -88.04064178466797, "objective/kl": 41.53803253173828, "objective/non_score_reward": -4.153803825378418, "objective/rlhf_reward": -15.215214347839357, "objective/scores": 0.35, "policy/approxkl_avg": 79.52337646484375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7973237037658691, "step": 793, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9963350296020508 }, { "episode": 12720, "epoch": 0.22863716432397455, "loss/policy_avg": 1.1484191417694092, "lr": 9.49258691206544e-06, "objective/entropy": 142.89540100097656, "objective/kl": 47.12466049194336, "objective/non_score_reward": -4.712466239929199, "objective/rlhf_reward": -17.49061485502569, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 23.96436882019043, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.38031205534935, "step": 794, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9986920356750488 }, { "episode": 12736, "epoch": 0.22892475824136319, "loss/policy_avg": -0.1955292522907257, "lr": 9.491947852760736e-06, "objective/entropy": -43.12257385253906, "objective/kl": 50.86421203613281, "objective/non_score_reward": -5.086421012878418, "objective/rlhf_reward": -18.398272822575507, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 0.7069367170333862, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.529199481010437, "step": 795, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0014419555664062 }, { "episode": 12752, "epoch": 0.22921235215875185, "loss/policy_avg": 0.12587346136569977, "lr": 9.491308793456033e-06, "objective/entropy": -115.25009155273438, "objective/kl": 24.33888053894043, "objective/non_score_reward": -2.4338879585266113, "objective/rlhf_reward": -8.284953455539092, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 0.4508776366710663, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.6505329012870789, "step": 796, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0010581016540527 }, { "episode": 12768, "epoch": 0.2294999460761405, "loss/policy_avg": 0.8157011270523071, "lr": 9.49066973415133e-06, "objective/entropy": -63.3074951171875, "objective/kl": 36.60755920410156, "objective/non_score_reward": -3.6607556343078613, "objective/rlhf_reward": -13.283773624633234, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 144.46054077148438, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7697768211364746, "step": 797, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9990262985229492 }, { "episode": 12784, "epoch": 0.22978753999352913, "loss/policy_avg": 0.22765851020812988, "lr": 9.490030674846627e-06, "objective/entropy": -187.6090545654297, "objective/kl": 37.852291107177734, "objective/non_score_reward": -3.7852296829223633, "objective/rlhf_reward": -13.40758492151896, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 24.931396484375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6455093622207642, "step": 798, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9983055591583252 }, { "episode": 12800, "epoch": 0.23007513391091777, "loss/policy_avg": 0.5693266987800598, "lr": 9.489391615541922e-06, "objective/entropy": -236.22152709960938, "objective/kl": 38.294769287109375, "objective/non_score_reward": -3.829477310180664, "objective/rlhf_reward": -13.836956623013378, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 6.2298102378845215, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6318610906600952, "step": 799, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0006356239318848 }, { "episode": 12816, "epoch": 0.23036272782830644, "loss/policy_avg": 0.7033164501190186, "lr": 9.488752556237219e-06, "objective/entropy": 144.00018310546875, "objective/kl": 39.03528594970703, "objective/non_score_reward": -3.9035286903381348, "objective/rlhf_reward": -13.789286489757608, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 4.471881866455078, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6523069143295288, "step": 800, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.000467300415039 }, { "episode": 12832, "epoch": 0.23065032174569508, "loss/policy_avg": 3.3897581100463867, "lr": 9.488113496932516e-06, "objective/entropy": 65.07791137695312, "objective/kl": 33.08277130126953, "objective/non_score_reward": -3.3082773685455322, "objective/rlhf_reward": -11.717337691577608, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 5.002783298492432, "policy/clipfrac_avg": 0.25, "policy/entropy_avg": 0.46239709854125977, "step": 801, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0012996196746826 }, { "episode": 12848, "epoch": 0.23093791566308372, "loss/policy_avg": 0.06800729036331177, "lr": 9.487474437627813e-06, "objective/entropy": -189.33834838867188, "objective/kl": 43.8226432800293, "objective/non_score_reward": -4.382264614105225, "objective/rlhf_reward": -16.1290584564209, "objective/scores": 0.35, "policy/approxkl_avg": 41.669837951660156, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6085605621337891, "step": 802, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0008633136749268 }, { "episode": 12864, "epoch": 0.23122550958047236, "loss/policy_avg": 0.5624558329582214, "lr": 9.48683537832311e-06, "objective/entropy": -188.01849365234375, "objective/kl": 42.200706481933594, "objective/non_score_reward": -4.220070838928223, "objective/rlhf_reward": -15.42968450030838, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 14.247980117797852, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6492480635643005, "step": 803, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.999314308166504 }, { "episode": 12880, "epoch": 0.23151310349786103, "loss/policy_avg": 1.8033558130264282, "lr": 9.486196319018407e-06, "objective/entropy": -126.62744140625, "objective/kl": 41.638519287109375, "objective/non_score_reward": -4.163851737976074, "objective/rlhf_reward": -14.993547921598541, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 3.053473711013794, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5263950824737549, "step": 804, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9994040727615356 }, { "episode": 12896, "epoch": 0.23180069741524967, "loss/policy_avg": 0.5185251235961914, "lr": 9.485557259713702e-06, "objective/entropy": 25.416759490966797, "objective/kl": 47.11408615112305, "objective/non_score_reward": -4.711408615112305, "objective/rlhf_reward": -17.32986172417038, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 9.99197769165039, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7477856874465942, "step": 805, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9990828037261963 }, { "episode": 12912, "epoch": 0.2320882913326383, "loss/policy_avg": 0.3774372935295105, "lr": 9.484918200408999e-06, "objective/entropy": 182.41983032226562, "objective/kl": 56.3293342590332, "objective/non_score_reward": -5.632933616638184, "objective/rlhf_reward": -21.01596125343674, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 109.34550476074219, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6443891525268555, "step": 806, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9989750385284424 }, { "episode": 12928, "epoch": 0.23237588525002698, "loss/policy_avg": 0.4774811267852783, "lr": 9.484279141104296e-06, "objective/entropy": 19.027976989746094, "objective/kl": 35.1290168762207, "objective/non_score_reward": -3.512901782989502, "objective/rlhf_reward": -12.495347588267876, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 3.8011960983276367, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.4730362296104431, "step": 807, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.999955415725708 }, { "episode": 12944, "epoch": 0.23266347916741562, "loss/policy_avg": 0.3621135354042053, "lr": 9.483640081799592e-06, "objective/entropy": 42.96700668334961, "objective/kl": 50.60865020751953, "objective/non_score_reward": -5.0608649253845215, "objective/rlhf_reward": -18.792861084552154, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 4.702620029449463, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6127219796180725, "step": 808, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9987151622772217 }, { "episode": 12960, "epoch": 0.23295107308480426, "loss/policy_avg": 0.24576711654663086, "lr": 9.48300102249489e-06, "objective/entropy": -266.4676513671875, "objective/kl": 42.15456771850586, "objective/non_score_reward": -4.215456962585449, "objective/rlhf_reward": -15.46182737350464, "objective/scores": 0.35, "policy/approxkl_avg": 1.908921718597412, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6165835857391357, "step": 809, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9997169971466064 }, { "episode": 12976, "epoch": 0.2332386670021929, "loss/policy_avg": 0.4992474317550659, "lr": 9.482361963190185e-06, "objective/entropy": 35.967933654785156, "objective/kl": 50.87047576904297, "objective/non_score_reward": -5.087048053741455, "objective/rlhf_reward": -18.74407175547274, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 6.219294548034668, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6987083554267883, "step": 810, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0008749961853027 }, { "episode": 12992, "epoch": 0.23352626091958156, "loss/policy_avg": 0.47408565878868103, "lr": 9.481722903885481e-06, "objective/entropy": 42.86602783203125, "objective/kl": 46.878440856933594, "objective/non_score_reward": -4.687844276428223, "objective/rlhf_reward": -14.35137782096863, "objective/scores": 1.1, "policy/approxkl_avg": 35.51860046386719, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.3832439184188843, "step": 811, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0014867782592773 }, { "episode": 13008, "epoch": 0.2338138548369702, "loss/policy_avg": 0.1151239275932312, "lr": 9.481083844580777e-06, "objective/entropy": -227.46157836914062, "objective/kl": 39.24964904785156, "objective/non_score_reward": -3.924964666366577, "objective/rlhf_reward": -14.321256973830561, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 21.682876586914062, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5061876773834229, "step": 812, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9995512962341309 }, { "episode": 13024, "epoch": 0.23410144875435884, "loss/policy_avg": 1.2807607650756836, "lr": 9.480444785276073e-06, "objective/entropy": 219.10873413085938, "objective/kl": 60.6810188293457, "objective/non_score_reward": -6.06810188293457, "objective/rlhf_reward": -22.756634318622286, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 107.69674682617188, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.541488528251648, "step": 813, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9991263151168823 }, { "episode": 13040, "epoch": 0.23438904267174748, "loss/policy_avg": 0.5082242488861084, "lr": 9.47980572597137e-06, "objective/entropy": -262.63189697265625, "objective/kl": 41.65293884277344, "objective/non_score_reward": -4.1652936935424805, "objective/rlhf_reward": -15.180222394879223, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 41.77598571777344, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.4324526786804199, "step": 814, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.003247022628784 }, { "episode": 13056, "epoch": 0.23467663658913615, "loss/policy_avg": -0.10035756230354309, "lr": 9.479166666666667e-06, "objective/entropy": 134.06137084960938, "objective/kl": 44.254085540771484, "objective/non_score_reward": -4.425408363342285, "objective/rlhf_reward": -15.876805897029946, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 23.891380310058594, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.621482253074646, "step": 815, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0046777725219727 }, { "episode": 13072, "epoch": 0.2349642305065248, "loss/policy_avg": 0.27624276280403137, "lr": 9.478527607361964e-06, "objective/entropy": -135.5484619140625, "objective/kl": 44.93092346191406, "objective/non_score_reward": -4.4930925369262695, "objective/rlhf_reward": -16.147539730342935, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 1.5283265113830566, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6181260347366333, "step": 816, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.000763416290283 }, { "episode": 13088, "epoch": 0.23525182442391343, "loss/policy_avg": 0.5251176357269287, "lr": 9.477888548057261e-06, "objective/entropy": -254.23443603515625, "objective/kl": 47.79570770263672, "objective/non_score_reward": -4.779571533203125, "objective/rlhf_reward": -17.739683725921015, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 1.8940521478652954, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5922014713287354, "step": 817, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.001004934310913 }, { "episode": 13104, "epoch": 0.23553941834130207, "loss/policy_avg": 0.9870522022247314, "lr": 9.477249488752556e-06, "objective/entropy": 18.714473724365234, "objective/kl": 32.423614501953125, "objective/non_score_reward": -3.242361545562744, "objective/rlhf_reward": -11.545614500244227, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 83.56348419189453, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4189128279685974, "step": 818, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0020997524261475 }, { "episode": 13120, "epoch": 0.23582701225869074, "loss/policy_avg": 0.4610193073749542, "lr": 9.476610429447853e-06, "objective/entropy": -88.04772186279297, "objective/kl": 39.214500427246094, "objective/non_score_reward": -3.921450138092041, "objective/rlhf_reward": -14.285800552368164, "objective/scores": 0.35, "policy/approxkl_avg": 13.349451065063477, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5404248237609863, "step": 819, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9971152544021606 }, { "episode": 13136, "epoch": 0.23611460617607938, "loss/policy_avg": 0.6160886287689209, "lr": 9.47597137014315e-06, "objective/entropy": -271.970458984375, "objective/kl": 37.36911392211914, "objective/non_score_reward": -3.7369112968444824, "objective/rlhf_reward": -13.523813803394404, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 18.014442443847656, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6574091911315918, "step": 820, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9968414306640625 }, { "episode": 13152, "epoch": 0.23640220009346802, "loss/policy_avg": 4.127373695373535, "lr": 9.475332310838447e-06, "objective/entropy": 22.463993072509766, "objective/kl": 45.64144515991211, "objective/non_score_reward": -4.564144611358643, "objective/rlhf_reward": -16.523244873682657, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 48.066158294677734, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.48151886463165283, "step": 821, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0003304481506348 }, { "episode": 13168, "epoch": 0.23668979401085666, "loss/policy_avg": 0.785558819770813, "lr": 9.474693251533744e-06, "objective/entropy": -93.47967529296875, "objective/kl": 45.30268096923828, "objective/non_score_reward": -4.530268669128418, "objective/rlhf_reward": -16.79556087020032, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 50.31500244140625, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4456283450126648, "step": 822, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.997105360031128 }, { "episode": 13184, "epoch": 0.23697738792824533, "loss/policy_avg": 1.3671177625656128, "lr": 9.474054192229039e-06, "objective/entropy": -68.2393798828125, "objective/kl": 52.268829345703125, "objective/non_score_reward": -5.2268829345703125, "objective/rlhf_reward": -18.507531261444093, "objective/scores": 0.6, "policy/approxkl_avg": 21.187942504882812, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5112272500991821, "step": 823, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0015244483947754 }, { "episode": 13200, "epoch": 0.23726498184563397, "loss/policy_avg": 1.411192536354065, "lr": 9.473415132924336e-06, "objective/entropy": 139.8941192626953, "objective/kl": 52.27011489868164, "objective/non_score_reward": -5.227011680603027, "objective/rlhf_reward": -19.50804648399353, "objective/scores": 0.35, "policy/approxkl_avg": 10.510537147521973, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.588684618473053, "step": 824, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.999821424484253 }, { "episode": 13216, "epoch": 0.2375525757630226, "loss/policy_avg": 2.240239143371582, "lr": 9.472776073619633e-06, "objective/entropy": 27.470577239990234, "objective/kl": 59.331642150878906, "objective/non_score_reward": -5.933164119720459, "objective/rlhf_reward": -20.808937226177427, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 7.889102458953857, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.567642331123352, "step": 825, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9986729621887207 }, { "episode": 13232, "epoch": 0.23784016968041127, "loss/policy_avg": 0.21238625049591064, "lr": 9.47213701431493e-06, "objective/entropy": -97.9186782836914, "objective/kl": 42.333152770996094, "objective/non_score_reward": -4.2333149909973145, "objective/rlhf_reward": -15.482662062259063, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 11.206239700317383, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5994842052459717, "step": 826, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0001232624053955 }, { "episode": 13248, "epoch": 0.2381277635977999, "loss/policy_avg": 0.1261046975851059, "lr": 9.471497955010226e-06, "objective/entropy": -33.41431427001953, "objective/kl": 44.24114990234375, "objective/non_score_reward": -4.42411470413208, "objective/rlhf_reward": -14.772739563823912, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 17.684558868408203, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6327311396598816, "step": 827, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.000532865524292 }, { "episode": 13264, "epoch": 0.23841535751518855, "loss/policy_avg": 0.5212262868881226, "lr": 9.470858895705523e-06, "objective/entropy": -101.5107192993164, "objective/kl": 46.10678482055664, "objective/non_score_reward": -4.610678672790527, "objective/rlhf_reward": -16.838595185343344, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 15.35481071472168, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5887008905410767, "step": 828, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9996135234832764 }, { "episode": 13280, "epoch": 0.2387029514325772, "loss/policy_avg": 0.6297311782836914, "lr": 9.470219836400818e-06, "objective/entropy": 17.642807006835938, "objective/kl": 45.446571350097656, "objective/non_score_reward": -4.544657230377197, "objective/rlhf_reward": -16.445295349756876, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 25.065134048461914, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6857582330703735, "step": 829, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.999565839767456 }, { "episode": 13296, "epoch": 0.23899054534996586, "loss/policy_avg": 1.2870714664459229, "lr": 9.469580777096115e-06, "objective/entropy": -112.94920349121094, "objective/kl": 41.829681396484375, "objective/non_score_reward": -4.182968616485596, "objective/rlhf_reward": -14.331874227523805, "objective/scores": 0.6, "policy/approxkl_avg": 43.81898498535156, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5229564905166626, "step": 830, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9994326829910278 }, { "episode": 13312, "epoch": 0.2392781392673545, "loss/policy_avg": 1.0273782014846802, "lr": 9.468941717791412e-06, "objective/entropy": -7.699493408203125, "objective/kl": 37.84484100341797, "objective/non_score_reward": -3.7844836711883545, "objective/rlhf_reward": -13.015229167715582, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 17.57620620727539, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4471469223499298, "step": 831, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9978184700012207 }, { "episode": 13328, "epoch": 0.23956573318474314, "loss/policy_avg": 0.7235412001609802, "lr": 9.468302658486709e-06, "objective/entropy": 106.56259155273438, "objective/kl": 60.04669189453125, "objective/non_score_reward": -6.004669189453125, "objective/rlhf_reward": -22.640074827758173, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 64.07603454589844, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5924360752105713, "step": 832, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.997698426246643 }, { "episode": 13344, "epoch": 0.23985332710213178, "loss/policy_avg": 0.26122790575027466, "lr": 9.467663599182006e-06, "objective/entropy": -15.060523986816406, "objective/kl": 56.78717803955078, "objective/non_score_reward": -5.678718090057373, "objective/rlhf_reward": -21.373235753088622, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 123.58401489257812, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.44901490211486816, "step": 833, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9980285167694092 }, { "episode": 13360, "epoch": 0.24014092101952045, "loss/policy_avg": -0.059971150010824203, "lr": 9.467024539877301e-06, "objective/entropy": -209.9939422607422, "objective/kl": 39.02555847167969, "objective/non_score_reward": -3.9025564193725586, "objective/rlhf_reward": -13.948365693510162, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 1.0669580698013306, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6912230253219604, "step": 834, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.0006463527679443 }, { "episode": 13376, "epoch": 0.2404285149369091, "loss/policy_avg": 0.13007503747940063, "lr": 9.466385480572598e-06, "objective/entropy": 166.1580352783203, "objective/kl": 43.581153869628906, "objective/non_score_reward": -4.358116149902344, "objective/rlhf_reward": -13.03246364593506, "objective/scores": 1.1, "policy/approxkl_avg": 8.686347007751465, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7475668787956238, "step": 835, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0001697540283203 }, { "episode": 13392, "epoch": 0.24071610885429773, "loss/policy_avg": 0.07029886543750763, "lr": 9.465746421267893e-06, "objective/entropy": -160.5312957763672, "objective/kl": 39.0033073425293, "objective/non_score_reward": -3.9003307819366455, "objective/rlhf_reward": -14.120369794781567, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 38.296531677246094, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.3903728723526001, "step": 836, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9994573593139648 }, { "episode": 13408, "epoch": 0.24100370277168637, "loss/policy_avg": -0.24450919032096863, "lr": 9.46510736196319e-06, "objective/entropy": 225.25355529785156, "objective/kl": 36.30988693237305, "objective/non_score_reward": -3.630988597869873, "objective/rlhf_reward": -13.043001535351634, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 33.346378326416016, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5566978454589844, "step": 837, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.000542163848877 }, { "episode": 13424, "epoch": 0.24129129668907504, "loss/policy_avg": 0.28000321984291077, "lr": 9.464468302658487e-06, "objective/entropy": -151.39920043945312, "objective/kl": 36.01958465576172, "objective/non_score_reward": -3.6019582748413086, "objective/rlhf_reward": -12.674499527613321, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 6.0946550369262695, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.2960823178291321, "step": 838, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0004286766052246 }, { "episode": 13440, "epoch": 0.24157889060646368, "loss/policy_avg": 1.0798143148422241, "lr": 9.463829243353784e-06, "objective/entropy": -37.02276611328125, "objective/kl": 38.723487854003906, "objective/non_score_reward": -3.8723487854003906, "objective/rlhf_reward": -14.0655630423623, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 69.36178588867188, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5190849304199219, "step": 839, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9978101253509521 }, { "episode": 13456, "epoch": 0.24186648452385232, "loss/policy_avg": 0.8364774584770203, "lr": 9.46319018404908e-06, "objective/entropy": 62.59022521972656, "objective/kl": 39.46584701538086, "objective/non_score_reward": -3.946584939956665, "objective/rlhf_reward": -14.444703629522948, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 19.215335845947266, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.44274285435676575, "step": 840, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9990289211273193 }, { "episode": 13472, "epoch": 0.24215407844124096, "loss/policy_avg": 0.9193323254585266, "lr": 9.462551124744378e-06, "objective/entropy": 101.40837097167969, "objective/kl": 41.96873474121094, "objective/non_score_reward": -4.196873664855957, "objective/rlhf_reward": -15.336895804019317, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 9.205939292907715, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.28277623653411865, "step": 841, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9998894929885864 }, { "episode": 13488, "epoch": 0.24244167235862962, "loss/policy_avg": 0.03237959370017052, "lr": 9.461912065439673e-06, "objective/entropy": -146.46066284179688, "objective/kl": 36.374385833740234, "objective/non_score_reward": -3.6374387741088867, "objective/rlhf_reward": -10.149754858016967, "objective/scores": 1.1, "policy/approxkl_avg": 0.5134851336479187, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.3451271057128906, "step": 842, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.002326726913452 }, { "episode": 13504, "epoch": 0.24272926627601826, "loss/policy_avg": 0.12626682221889496, "lr": 9.46127300613497e-06, "objective/entropy": -148.53372192382812, "objective/kl": 43.185630798339844, "objective/non_score_reward": -4.318563461303711, "objective/rlhf_reward": -14.87425241470337, "objective/scores": 0.6, "policy/approxkl_avg": 5.643270969390869, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6915034651756287, "step": 843, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.01680326461792 }, { "episode": 13520, "epoch": 0.2430168601934069, "loss/policy_avg": -0.19212225079536438, "lr": 9.460633946830267e-06, "objective/entropy": -70.89179229736328, "objective/kl": 49.2342529296875, "objective/non_score_reward": -4.923425197601318, "objective/rlhf_reward": -15.293701267242433, "objective/scores": 1.1, "policy/approxkl_avg": 3.8346972465515137, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.40477171540260315, "step": 844, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0019445419311523 }, { "episode": 13536, "epoch": 0.24330445411079557, "loss/policy_avg": 2.627497434616089, "lr": 9.459994887525563e-06, "objective/entropy": 129.54180908203125, "objective/kl": 53.16524887084961, "objective/non_score_reward": -5.316524505615234, "objective/rlhf_reward": -18.866099452972414, "objective/scores": 0.6, "policy/approxkl_avg": 23.311412811279297, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5597689747810364, "step": 845, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9994431734085083 }, { "episode": 13552, "epoch": 0.2435920480281842, "loss/policy_avg": 0.1320989578962326, "lr": 9.45935582822086e-06, "objective/entropy": 152.18173217773438, "objective/kl": 43.19120788574219, "objective/non_score_reward": -4.319120407104492, "objective/rlhf_reward": -15.451653595241616, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 2.6365280151367188, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.70622318983078, "step": 846, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.003366470336914 }, { "episode": 13568, "epoch": 0.24387964194557285, "loss/policy_avg": -0.20065978169441223, "lr": 9.458716768916156e-06, "objective/entropy": -52.614356994628906, "objective/kl": 30.15610694885254, "objective/non_score_reward": -3.015610694885254, "objective/rlhf_reward": -10.736930403739137, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 3.9257869720458984, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.4547704756259918, "step": 847, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0025835037231445 }, { "episode": 13584, "epoch": 0.2441672358629615, "loss/policy_avg": 2.7808499336242676, "lr": 9.458077709611452e-06, "objective/entropy": -88.72569274902344, "objective/kl": 41.056114196777344, "objective/non_score_reward": -4.105611324310303, "objective/rlhf_reward": -14.99861272116479, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 13.171248435974121, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.636700451374054, "step": 848, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9982078075408936 }, { "episode": 13600, "epoch": 0.24445482978035016, "loss/policy_avg": 0.17049476504325867, "lr": 9.45743865030675e-06, "objective/entropy": -14.615028381347656, "objective/kl": 44.74107360839844, "objective/non_score_reward": -4.47410774230957, "objective/rlhf_reward": -16.380659901889498, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 23.5351619720459, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.42227408289909363, "step": 849, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9997400045394897 }, { "episode": 13616, "epoch": 0.2447424236977388, "loss/policy_avg": -0.5285428166389465, "lr": 9.456799591002046e-06, "objective/entropy": -304.7969970703125, "objective/kl": 40.3505973815918, "objective/non_score_reward": -4.035059928894043, "objective/rlhf_reward": -14.814726147681398, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 14.866556167602539, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.5371728539466858, "step": 850, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.0025596618652344 }, { "episode": 13632, "epoch": 0.24503001761512744, "loss/policy_avg": 0.7969543933868408, "lr": 9.456160531697343e-06, "objective/entropy": 73.470947265625, "objective/kl": 38.98177719116211, "objective/non_score_reward": -3.8981776237487793, "objective/rlhf_reward": -13.859377400080362, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 9.109411239624023, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5251634120941162, "step": 851, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0007219314575195 }, { "episode": 13648, "epoch": 0.24531761153251608, "loss/policy_avg": 1.060788631439209, "lr": 9.45552147239264e-06, "objective/entropy": -220.52978515625, "objective/kl": 39.64278793334961, "objective/non_score_reward": -3.96427845954895, "objective/rlhf_reward": -11.457114553451538, "objective/scores": 1.1, "policy/approxkl_avg": 20.378562927246094, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5584784150123596, "step": 852, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.998639702796936 }, { "episode": 13664, "epoch": 0.24560520544990475, "loss/policy_avg": 1.2480220794677734, "lr": 9.454882413087935e-06, "objective/entropy": 107.21774291992188, "objective/kl": 45.63555908203125, "objective/non_score_reward": -4.563555717468262, "objective/rlhf_reward": -16.429394836696694, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 19.441165924072266, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.575886607170105, "step": 853, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9971050024032593 }, { "episode": 13680, "epoch": 0.24589279936729339, "loss/policy_avg": 0.4997670352458954, "lr": 9.454243353783232e-06, "objective/entropy": 55.699459075927734, "objective/kl": 50.389007568359375, "objective/non_score_reward": -5.038900375366211, "objective/rlhf_reward": -18.63983019569748, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 0.9880640506744385, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5993074178695679, "step": 854, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.001983880996704 }, { "episode": 13696, "epoch": 0.24618039328468203, "loss/policy_avg": 0.6334704756736755, "lr": 9.453604294478529e-06, "objective/entropy": -116.13612365722656, "objective/kl": 49.5648193359375, "objective/non_score_reward": -4.956482410430908, "objective/rlhf_reward": -18.425929164886476, "objective/scores": 0.35, "policy/approxkl_avg": 26.174985885620117, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7205100059509277, "step": 855, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9990193843841553 }, { "episode": 13712, "epoch": 0.24646798720207067, "loss/policy_avg": -0.25992709398269653, "lr": 9.452965235173824e-06, "objective/entropy": -101.61711120605469, "objective/kl": 60.37242889404297, "objective/non_score_reward": -6.037242889404297, "objective/rlhf_reward": -22.59271368285711, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 28.445724487304688, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7341337203979492, "step": 856, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9983941316604614 }, { "episode": 13728, "epoch": 0.24675558111945933, "loss/policy_avg": 1.5085573196411133, "lr": 9.452326175869121e-06, "objective/entropy": -4.551849365234375, "objective/kl": 51.28207015991211, "objective/non_score_reward": -5.128207206726074, "objective/rlhf_reward": -18.95656809112127, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 35.152862548828125, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5039411187171936, "step": 857, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9979238510131836 }, { "episode": 13744, "epoch": 0.24704317503684797, "loss/policy_avg": 0.3509300947189331, "lr": 9.451687116564418e-06, "objective/entropy": -296.17901611328125, "objective/kl": 22.41075897216797, "objective/non_score_reward": -2.2410757541656494, "objective/rlhf_reward": -6.841596784369026, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 24.681076049804688, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7068237662315369, "step": 858, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 15, "val/ratio": 1.9986300468444824 }, { "episode": 13760, "epoch": 0.2473307689542366, "loss/policy_avg": 0.4679286777973175, "lr": 9.451048057259715e-06, "objective/entropy": -0.08905029296875, "objective/kl": 46.414649963378906, "objective/non_score_reward": -4.641464710235596, "objective/rlhf_reward": -17.240346703559084, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 44.908973693847656, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.44072186946868896, "step": 859, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.997888207435608 }, { "episode": 13776, "epoch": 0.24761836287162525, "loss/policy_avg": 1.0680372714996338, "lr": 9.45040899795501e-06, "objective/entropy": -286.050537109375, "objective/kl": 41.66375732421875, "objective/non_score_reward": -4.166375637054443, "objective/rlhf_reward": -16.665502786636353, "objective/scores": 0.0, "policy/approxkl_avg": 30.8399715423584, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6297662854194641, "step": 860, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.997706413269043 }, { "episode": 13792, "epoch": 0.24790595678901392, "loss/policy_avg": 0.23324424028396606, "lr": 9.449769938650307e-06, "objective/entropy": -387.7367858886719, "objective/kl": 42.02001953125, "objective/non_score_reward": -4.202002048492432, "objective/rlhf_reward": -14.860596249775824, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 12.314942359924316, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7003234028816223, "step": 861, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9979217052459717 }, { "episode": 13808, "epoch": 0.24819355070640256, "loss/policy_avg": 0.7436450719833374, "lr": 9.449130879345604e-06, "objective/entropy": -354.4407958984375, "objective/kl": 36.00145721435547, "objective/non_score_reward": -3.6001460552215576, "objective/rlhf_reward": -12.919631603176953, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 3.0306484699249268, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5300300717353821, "step": 862, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0001163482666016 }, { "episode": 13824, "epoch": 0.2484811446237912, "loss/policy_avg": 0.6046145558357239, "lr": 9.4484918200409e-06, "objective/entropy": -235.1295166015625, "objective/kl": 38.761863708496094, "objective/non_score_reward": -3.8761868476867676, "objective/rlhf_reward": -13.557335923390326, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 30.976497650146484, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6000299453735352, "step": 863, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9970474243164062 }, { "episode": 13840, "epoch": 0.24876873854117984, "loss/policy_avg": 0.21184206008911133, "lr": 9.447852760736197e-06, "objective/entropy": -39.172943115234375, "objective/kl": 42.02351379394531, "objective/non_score_reward": -4.2023515701293945, "objective/rlhf_reward": -15.385573704441157, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 19.583112716674805, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5341065526008606, "step": 864, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9986238479614258 }, { "episode": 13856, "epoch": 0.2490563324585685, "loss/policy_avg": 0.4282435178756714, "lr": 9.447213701431494e-06, "objective/entropy": -285.52001953125, "objective/kl": 42.929405212402344, "objective/non_score_reward": -4.292940139770508, "objective/rlhf_reward": -15.771761751174928, "objective/scores": 0.35, "policy/approxkl_avg": 3.497749090194702, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7426670789718628, "step": 865, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9992247819900513 }, { "episode": 13872, "epoch": 0.24934392637595715, "loss/policy_avg": 0.29745015501976013, "lr": 9.44657464212679e-06, "objective/entropy": 46.30607604980469, "objective/kl": 45.067955017089844, "objective/non_score_reward": -4.506795406341553, "objective/rlhf_reward": -16.701668295890016, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 41.62073516845703, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.501271665096283, "step": 866, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9972460269927979 }, { "episode": 13888, "epoch": 0.2496315202933458, "loss/policy_avg": 0.5706069469451904, "lr": 9.445935582822086e-06, "objective/entropy": 61.499298095703125, "objective/kl": 44.0521240234375, "objective/non_score_reward": -4.405212879180908, "objective/rlhf_reward": -16.064591257777764, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 14.744949340820312, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4016445279121399, "step": 867, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9990034103393555 }, { "episode": 13904, "epoch": 0.24991911421073446, "loss/policy_avg": 1.6848845481872559, "lr": 9.445296523517383e-06, "objective/entropy": -261.97088623046875, "objective/kl": 37.381866455078125, "objective/non_score_reward": -3.738186836242676, "objective/rlhf_reward": -13.127918358120034, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 22.18919563293457, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4207575023174286, "step": 868, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9996562004089355 }, { "episode": 13920, "epoch": 0.25020670812812307, "loss/policy_avg": 1.0744154453277588, "lr": 9.44465746421268e-06, "objective/entropy": -269.3877258300781, "objective/kl": 41.17782974243164, "objective/non_score_reward": -4.117783069610596, "objective/rlhf_reward": -14.990179660733105, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 44.93556213378906, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.567723274230957, "step": 869, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.998097538948059 }, { "episode": 13936, "epoch": 0.25049430204551176, "loss/policy_avg": -0.518259584903717, "lr": 9.444018404907977e-06, "objective/entropy": -99.76119995117188, "objective/kl": 58.50865936279297, "objective/non_score_reward": -5.850865840911865, "objective/rlhf_reward": -21.456051657872138, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 17.84606170654297, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.45481520891189575, "step": 870, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9988923072814941 }, { "episode": 13952, "epoch": 0.2507818959629004, "loss/policy_avg": -0.3090100586414337, "lr": 9.443379345603272e-06, "objective/entropy": -71.82542419433594, "objective/kl": 40.61528778076172, "objective/non_score_reward": -4.06152868270874, "objective/rlhf_reward": -14.641994628969748, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 4.262547492980957, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.408467173576355, "step": 871, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0034618377685547 }, { "episode": 13968, "epoch": 0.25106948988028904, "loss/policy_avg": 0.9268704652786255, "lr": 9.442740286298569e-06, "objective/entropy": -257.0228271484375, "objective/kl": 41.92462158203125, "objective/non_score_reward": -4.192461967468262, "objective/rlhf_reward": -15.036515728632608, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 27.330509185791016, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.49810606241226196, "step": 872, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9987335205078125 }, { "episode": 13984, "epoch": 0.2513570837976777, "loss/policy_avg": 0.6296570301055908, "lr": 9.442101226993866e-06, "objective/entropy": -389.610595703125, "objective/kl": 35.68389892578125, "objective/non_score_reward": -3.568390130996704, "objective/rlhf_reward": -12.448731775554727, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 16.5224609375, "policy/clipfrac_avg": 0.25, "policy/entropy_avg": 0.7835655212402344, "step": 873, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.999846339225769 }, { "episode": 14000, "epoch": 0.2516446777150663, "loss/policy_avg": 0.4277964234352112, "lr": 9.441462167689163e-06, "objective/entropy": -170.8277587890625, "objective/kl": 38.57947540283203, "objective/non_score_reward": -3.85794734954834, "objective/rlhf_reward": -13.827669653956015, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 44.01490783691406, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6223859786987305, "step": 874, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.000417470932007 }, { "episode": 14016, "epoch": 0.25193227163245496, "loss/policy_avg": 0.8091301918029785, "lr": 9.44082310838446e-06, "objective/entropy": -39.32908630371094, "objective/kl": 39.68909454345703, "objective/non_score_reward": -3.968909740447998, "objective/rlhf_reward": -13.752931775824102, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 28.2178897857666, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6742968559265137, "step": 875, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9996860027313232 }, { "episode": 14032, "epoch": 0.2522198655498436, "loss/policy_avg": 0.743851900100708, "lr": 9.440184049079757e-06, "objective/entropy": -351.21734619140625, "objective/kl": 39.85813903808594, "objective/non_score_reward": -3.985814094543457, "objective/rlhf_reward": -14.209922806421915, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 59.21538543701172, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5436392426490784, "step": 876, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.997950792312622 }, { "episode": 14048, "epoch": 0.25250745946723224, "loss/policy_avg": 1.5350085496902466, "lr": 9.439544989775052e-06, "objective/entropy": -65.44673156738281, "objective/kl": 50.07268524169922, "objective/non_score_reward": -5.00726842880249, "objective/rlhf_reward": -17.906367959753545, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 29.94268226623535, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7696713209152222, "step": 877, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0026378631591797 }, { "episode": 14064, "epoch": 0.25279505338462094, "loss/policy_avg": 3.7316019535064697, "lr": 9.438905930470349e-06, "objective/entropy": -307.8963623046875, "objective/kl": 38.285011291503906, "objective/non_score_reward": -3.828500747680664, "objective/rlhf_reward": -13.652143721998321, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 28.130430221557617, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8377779722213745, "step": 878, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 2.0074470043182373 }, { "episode": 14080, "epoch": 0.2530826473020096, "loss/policy_avg": 0.38605859875679016, "lr": 9.438266871165644e-06, "objective/entropy": -106.09515380859375, "objective/kl": 36.02558517456055, "objective/non_score_reward": -3.6025586128234863, "objective/rlhf_reward": -12.46282274551862, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 7.791407585144043, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.48136669397354126, "step": 879, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9982140064239502 }, { "episode": 14096, "epoch": 0.2533702412193982, "loss/policy_avg": -0.01890498399734497, "lr": 9.43762781186094e-06, "objective/entropy": -181.742431640625, "objective/kl": 42.437530517578125, "objective/non_score_reward": -4.243752956390381, "objective/rlhf_reward": -15.649499688178224, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 15.317750930786133, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.511642575263977, "step": 880, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0014102458953857 }, { "episode": 14112, "epoch": 0.25365783513678686, "loss/policy_avg": 0.030795343220233917, "lr": 9.436988752556238e-06, "objective/entropy": -230.95361328125, "objective/kl": 30.112140655517578, "objective/non_score_reward": -3.011213779449463, "objective/rlhf_reward": -9.12113658034918, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 12.038021087646484, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5518423914909363, "step": 881, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9984687566757202 }, { "episode": 14128, "epoch": 0.2539454290541755, "loss/policy_avg": 0.4279516935348511, "lr": 9.436349693251534e-06, "objective/entropy": -245.98117065429688, "objective/kl": 43.55865478515625, "objective/non_score_reward": -4.355865478515625, "objective/rlhf_reward": -15.023462629318239, "objective/scores": 0.6, "policy/approxkl_avg": 2.3968582153320312, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7229801416397095, "step": 882, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0013785362243652 }, { "episode": 14144, "epoch": 0.25423302297156414, "loss/policy_avg": 0.7990102767944336, "lr": 9.435710633946831e-06, "objective/entropy": -141.9425506591797, "objective/kl": 36.01177215576172, "objective/non_score_reward": -3.60117769241333, "objective/rlhf_reward": -12.95411191424881, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 35.52224349975586, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6370751857757568, "step": 883, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9986944198608398 }, { "episode": 14160, "epoch": 0.2545206168889528, "loss/policy_avg": 1.1767592430114746, "lr": 9.435071574642126e-06, "objective/entropy": -3.1976966857910156, "objective/kl": 52.58678436279297, "objective/non_score_reward": -5.258677959442139, "objective/rlhf_reward": -18.11099377715704, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 64.22396850585938, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5332536697387695, "step": 884, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9978070259094238 }, { "episode": 14176, "epoch": 0.2548082108063415, "loss/policy_avg": 0.09251243621110916, "lr": 9.434432515337423e-06, "objective/entropy": -235.44993591308594, "objective/kl": 47.086631774902344, "objective/non_score_reward": -4.708662986755371, "objective/rlhf_reward": -17.10131909052531, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 12.522943496704102, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6954100131988525, "step": 885, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9970701932907104 }, { "episode": 14192, "epoch": 0.2550958047237301, "loss/policy_avg": 0.7762466073036194, "lr": 9.43379345603272e-06, "objective/entropy": -65.08514404296875, "objective/kl": 34.25334930419922, "objective/non_score_reward": -3.425334930419922, "objective/rlhf_reward": -12.37582734587781, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 10.137767791748047, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6340222358703613, "step": 886, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.997950792312622 }, { "episode": 14208, "epoch": 0.25538339864111875, "loss/policy_avg": 0.4137943387031555, "lr": 9.433154396728017e-06, "objective/entropy": -208.6219940185547, "objective/kl": 48.540523529052734, "objective/non_score_reward": -4.854052543640137, "objective/rlhf_reward": -19.41620969772339, "objective/scores": 0.0, "policy/approxkl_avg": 6.9052228927612305, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6014610528945923, "step": 887, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9984641075134277 }, { "episode": 14224, "epoch": 0.2556709925585074, "loss/policy_avg": 0.041199300438165665, "lr": 9.432515337423314e-06, "objective/entropy": 46.358150482177734, "objective/kl": 47.68404769897461, "objective/non_score_reward": -4.768404960632324, "objective/rlhf_reward": -16.14990035140631, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 10.196189880371094, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6130638122558594, "step": 888, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0053634643554688 }, { "episode": 14240, "epoch": 0.25595858647589603, "loss/policy_avg": 0.9656521081924438, "lr": 9.431876278118611e-06, "objective/entropy": -300.8107604980469, "objective/kl": 40.519744873046875, "objective/non_score_reward": -4.051974296569824, "objective/rlhf_reward": -14.829295017806391, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 155.98867797851562, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6544891595840454, "step": 889, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 1.9983346462249756 }, { "episode": 14256, "epoch": 0.2562461803932847, "loss/policy_avg": 1.5469386577606201, "lr": 9.431237218813906e-06, "objective/entropy": -53.928260803222656, "objective/kl": 51.69367599487305, "objective/non_score_reward": -5.169367790222168, "objective/rlhf_reward": -19.318220340941828, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 15.555410385131836, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5645780563354492, "step": 890, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9977774620056152 }, { "episode": 14272, "epoch": 0.2565337743106733, "loss/policy_avg": 1.4335881471633911, "lr": 9.430598159509203e-06, "objective/entropy": -98.00782775878906, "objective/kl": 39.847450256347656, "objective/non_score_reward": -3.9847452640533447, "objective/rlhf_reward": -14.334861550394614, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 28.548376083374023, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.697210431098938, "step": 891, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9980844259262085 }, { "episode": 14288, "epoch": 0.25682136822806195, "loss/policy_avg": 1.1344060897827148, "lr": 9.4299591002045e-06, "objective/entropy": -38.29204177856445, "objective/kl": 40.507850646972656, "objective/non_score_reward": -4.050785064697266, "objective/rlhf_reward": -14.541280036390411, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 48.63218307495117, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5049742460250854, "step": 892, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9978201389312744 }, { "episode": 14304, "epoch": 0.25710896214545065, "loss/policy_avg": -0.35933157801628113, "lr": 9.429320040899797e-06, "objective/entropy": -117.44267272949219, "objective/kl": 30.698312759399414, "objective/non_score_reward": -3.069831371307373, "objective/rlhf_reward": -10.617466216505157, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 1.3441765308380127, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.3006622791290283, "step": 893, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0008156299591064 }, { "episode": 14320, "epoch": 0.2573965560628393, "loss/policy_avg": 0.8382015824317932, "lr": 9.428680981595094e-06, "objective/entropy": -200.59832763671875, "objective/kl": 43.460121154785156, "objective/non_score_reward": -4.346012115478516, "objective/rlhf_reward": -15.868276679309542, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 12.426078796386719, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5876235961914062, "step": 894, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9994069337844849 }, { "episode": 14336, "epoch": 0.25768414998022793, "loss/policy_avg": 0.687677264213562, "lr": 9.42804192229039e-06, "objective/entropy": 203.2930908203125, "objective/kl": 59.16074752807617, "objective/non_score_reward": -5.916074752807617, "objective/rlhf_reward": -21.839470739635537, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 170.25958251953125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6621222496032715, "step": 895, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9983516931533813 }, { "episode": 14352, "epoch": 0.25797174389761657, "loss/policy_avg": -0.23158738017082214, "lr": 9.427402862985686e-06, "objective/entropy": -174.0965118408203, "objective/kl": 45.058555603027344, "objective/non_score_reward": -4.505855560302734, "objective/rlhf_reward": -16.697908911734743, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 19.966630935668945, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6067174673080444, "step": 896, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.999699592590332 }, { "episode": 14368, "epoch": 0.2582593378150052, "loss/policy_avg": -0.5135414600372314, "lr": 9.426763803680982e-06, "objective/entropy": 179.40419006347656, "objective/kl": 59.1436882019043, "objective/non_score_reward": -5.914369106292725, "objective/rlhf_reward": -22.233643849094477, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 36.81080627441406, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 1.0215977430343628, "step": 897, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9994231462478638 }, { "episode": 14384, "epoch": 0.25854693173239385, "loss/policy_avg": 0.39005035161972046, "lr": 9.42612474437628e-06, "objective/entropy": -332.71807861328125, "objective/kl": 42.75315856933594, "objective/non_score_reward": -4.27531623840332, "objective/rlhf_reward": -12.70126543045044, "objective/scores": 1.1, "policy/approxkl_avg": 27.193714141845703, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.46848946809768677, "step": 898, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9977561235427856 }, { "episode": 14400, "epoch": 0.2588345256497825, "loss/policy_avg": -0.6668556928634644, "lr": 9.425485685071576e-06, "objective/entropy": -235.47409057617188, "objective/kl": 35.54128646850586, "objective/non_score_reward": -3.554128646850586, "objective/rlhf_reward": -12.735561969693066, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 42.73648452758789, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5194951295852661, "step": 899, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0026841163635254 }, { "episode": 14416, "epoch": 0.2591221195671711, "loss/policy_avg": 0.5247483849525452, "lr": 9.424846625766873e-06, "objective/entropy": -204.9716339111328, "objective/kl": 38.4921989440918, "objective/non_score_reward": -3.849219799041748, "objective/rlhf_reward": -13.663545624415079, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 50.06997299194336, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6830487251281738, "step": 900, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9986555576324463 }, { "episode": 14432, "epoch": 0.2594097134845598, "loss/policy_avg": 0.6800730228424072, "lr": 9.424207566462168e-06, "objective/entropy": -32.36799240112305, "objective/kl": 47.80828857421875, "objective/non_score_reward": -4.780828952789307, "objective/rlhf_reward": -17.519195351664145, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 79.11212921142578, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6230674982070923, "step": 901, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.99847412109375 }, { "episode": 14448, "epoch": 0.25969730740194846, "loss/policy_avg": 0.290429025888443, "lr": 9.423568507157465e-06, "objective/entropy": -182.06155395507812, "objective/kl": 50.57691955566406, "objective/non_score_reward": -5.057692050933838, "objective/rlhf_reward": -18.780170063586578, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 78.39713287353516, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7395067811012268, "step": 902, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9985625743865967 }, { "episode": 14464, "epoch": 0.2599849013193371, "loss/policy_avg": 1.0716335773468018, "lr": 9.42292944785276e-06, "objective/entropy": -12.325759887695312, "objective/kl": 50.18354415893555, "objective/non_score_reward": -5.018354415893555, "objective/rlhf_reward": -18.622819285006866, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 11.29092025756836, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5627316236495972, "step": 903, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9985690116882324 }, { "episode": 14480, "epoch": 0.26027249523672574, "loss/policy_avg": 0.7096864581108093, "lr": 9.422290388548057e-06, "objective/entropy": 128.26751708984375, "objective/kl": 47.753265380859375, "objective/non_score_reward": -4.775326728820801, "objective/rlhf_reward": -17.58553596714371, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 14.658417701721191, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7661948204040527, "step": 904, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9982168674468994 }, { "episode": 14496, "epoch": 0.2605600891541144, "loss/policy_avg": 0.7332016229629517, "lr": 9.421651329243354e-06, "objective/entropy": -9.548530578613281, "objective/kl": 35.575138092041016, "objective/non_score_reward": -3.557513952255249, "objective/rlhf_reward": -12.405227299007485, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 9.975175857543945, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7684756517410278, "step": 905, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9979183673858643 }, { "episode": 14512, "epoch": 0.260847683071503, "loss/policy_avg": 0.16122013330459595, "lr": 9.421012269938651e-06, "objective/entropy": 57.9202880859375, "objective/kl": 40.8089599609375, "objective/non_score_reward": -4.080896377563477, "objective/rlhf_reward": -14.719464097086508, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 33.88508605957031, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.3608490824699402, "step": 906, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9993462562561035 }, { "episode": 14528, "epoch": 0.26113527698889166, "loss/policy_avg": 2.275667190551758, "lr": 9.420373210633948e-06, "objective/entropy": -2.799551010131836, "objective/kl": 57.71617126464844, "objective/non_score_reward": -5.7716169357299805, "objective/rlhf_reward": -18.686467742919923, "objective/scores": 1.1, "policy/approxkl_avg": 110.55852508544922, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5820337533950806, "step": 907, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9986871480941772 }, { "episode": 14544, "epoch": 0.26142287090628036, "loss/policy_avg": 0.9861481785774231, "lr": 9.419734151329245e-06, "objective/entropy": 91.0115737915039, "objective/kl": 45.30067825317383, "objective/non_score_reward": -4.5300679206848145, "objective/rlhf_reward": -16.741669514266352, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 6.376153469085693, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.39600497484207153, "step": 908, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0001068115234375 }, { "episode": 14560, "epoch": 0.261710464823669, "loss/policy_avg": 1.1333844661712646, "lr": 9.41909509202454e-06, "objective/entropy": -235.7226104736328, "objective/kl": 30.271202087402344, "objective/non_score_reward": -3.027120351791382, "objective/rlhf_reward": -10.70848128795624, "objective/scores": 0.35, "policy/approxkl_avg": 32.180904388427734, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.667456865310669, "step": 909, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9989008903503418 }, { "episode": 14576, "epoch": 0.26199805874105764, "loss/policy_avg": 0.17538058757781982, "lr": 9.418456032719837e-06, "objective/entropy": 134.0635986328125, "objective/kl": 42.076332092285156, "objective/non_score_reward": -4.2076334953308105, "objective/rlhf_reward": -15.48889785101953, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 32.24264144897461, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.513163685798645, "step": 910, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9992715120315552 }, { "episode": 14592, "epoch": 0.2622856526584463, "loss/policy_avg": -0.15949714183807373, "lr": 9.417816973415134e-06, "objective/entropy": -46.453277587890625, "objective/kl": 38.9816780090332, "objective/non_score_reward": -3.8981680870056152, "objective/rlhf_reward": -13.645260284619269, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 21.381032943725586, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7189059257507324, "step": 911, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.00034761428833 }, { "episode": 14608, "epoch": 0.2625732465758349, "loss/policy_avg": 0.20729732513427734, "lr": 9.41717791411043e-06, "objective/entropy": -49.87384033203125, "objective/kl": 42.82794952392578, "objective/non_score_reward": -4.282794952392578, "objective/rlhf_reward": -15.527060303751547, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 1.6806117296218872, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6336564421653748, "step": 912, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0001604557037354 }, { "episode": 14624, "epoch": 0.26286084049322356, "loss/policy_avg": -0.13027964532375336, "lr": 9.416538854805727e-06, "objective/entropy": -235.79547119140625, "objective/kl": 46.248512268066406, "objective/non_score_reward": -4.624851226806641, "objective/rlhf_reward": -17.14015623304693, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 54.118743896484375, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5618636012077332, "step": 913, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9989900588989258 }, { "episode": 14640, "epoch": 0.2631484344106122, "loss/policy_avg": 0.32277536392211914, "lr": 9.415899795501023e-06, "objective/entropy": 57.70044708251953, "objective/kl": 59.526153564453125, "objective/non_score_reward": -5.952615737915039, "objective/rlhf_reward": -22.468827298193602, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 20.54634666442871, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7771313190460205, "step": 914, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9986742734909058 }, { "episode": 14656, "epoch": 0.26343602832800084, "loss/policy_avg": 0.6588897705078125, "lr": 9.41526073619632e-06, "objective/entropy": -144.39913940429688, "objective/kl": 53.891265869140625, "objective/non_score_reward": -5.389126777648926, "objective/rlhf_reward": -20.230992827445192, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 29.271865844726562, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.46916764974594116, "step": 915, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9994072914123535 }, { "episode": 14672, "epoch": 0.26372362224538953, "loss/policy_avg": 0.040606118738651276, "lr": 9.414621676891616e-06, "objective/entropy": -236.26051330566406, "objective/kl": 35.742431640625, "objective/non_score_reward": -3.5742433071136475, "objective/rlhf_reward": -12.781201684268648, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 4.916508674621582, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5409867167472839, "step": 916, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.002603769302368 }, { "episode": 14688, "epoch": 0.2640112161627782, "loss/policy_avg": 0.4728504419326782, "lr": 9.413982617586913e-06, "objective/entropy": -3.219512939453125, "objective/kl": 52.13475036621094, "objective/non_score_reward": -5.213474750518799, "objective/rlhf_reward": -19.47529683360229, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 11.293510437011719, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7324811220169067, "step": 917, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9995346069335938 }, { "episode": 14704, "epoch": 0.2642988100801668, "loss/policy_avg": 0.7524695992469788, "lr": 9.41334355828221e-06, "objective/entropy": 69.59600830078125, "objective/kl": 40.76435852050781, "objective/non_score_reward": -4.0764360427856445, "objective/rlhf_reward": -14.358331988530097, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 22.529842376708984, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7941198348999023, "step": 918, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9969267845153809 }, { "episode": 14720, "epoch": 0.26458640399755545, "loss/policy_avg": 1.3578912019729614, "lr": 9.412704498977507e-06, "objective/entropy": -154.57412719726562, "objective/kl": 49.238441467285156, "objective/non_score_reward": -4.923844337463379, "objective/rlhf_reward": -19.695377826690674, "objective/scores": 0.0, "policy/approxkl_avg": 3.7411012649536133, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6235677003860474, "step": 919, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.004084348678589 }, { "episode": 14736, "epoch": 0.2648739979149441, "loss/policy_avg": 0.38867104053497314, "lr": 9.412065439672802e-06, "objective/entropy": -79.98590850830078, "objective/kl": 41.547691345214844, "objective/non_score_reward": -4.1547698974609375, "objective/rlhf_reward": -14.88574506441752, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 137.96481323242188, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.2868984043598175, "step": 920, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.999016523361206 }, { "episode": 14752, "epoch": 0.26516159183233273, "loss/policy_avg": 0.6230953931808472, "lr": 9.411426380368099e-06, "objective/entropy": -143.03982543945312, "objective/kl": 49.02768325805664, "objective/non_score_reward": -4.902768611907959, "objective/rlhf_reward": -17.488367976919683, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 4.535545349121094, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5886020660400391, "step": 921, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.999190330505371 }, { "episode": 14768, "epoch": 0.2654491857497214, "loss/policy_avg": -0.35152608156204224, "lr": 9.410787321063396e-06, "objective/entropy": -254.19827270507812, "objective/kl": 39.34025573730469, "objective/non_score_reward": -3.934025764465332, "objective/rlhf_reward": -13.788691113667426, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 18.294979095458984, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6331444978713989, "step": 922, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 2.0006368160247803 }, { "episode": 14784, "epoch": 0.26573677966711007, "loss/policy_avg": 1.660903811454773, "lr": 9.410148261758691e-06, "objective/entropy": 51.40530014038086, "objective/kl": 54.351776123046875, "objective/non_score_reward": -5.435177326202393, "objective/rlhf_reward": -20.224937999042208, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 30.1667423248291, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.40519797801971436, "step": 923, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9972550868988037 }, { "episode": 14800, "epoch": 0.2660243735844987, "loss/policy_avg": -0.4044073820114136, "lr": 9.409509202453988e-06, "objective/entropy": 100.78778839111328, "objective/kl": 55.377384185791016, "objective/non_score_reward": -5.537738800048828, "objective/rlhf_reward": -20.77235255488525, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 0.32742178440093994, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5611143708229065, "step": 924, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0023300647735596 }, { "episode": 14816, "epoch": 0.26631196750188735, "loss/policy_avg": 0.9976423382759094, "lr": 9.408870143149285e-06, "objective/entropy": -75.03382873535156, "objective/kl": 41.136863708496094, "objective/non_score_reward": -4.113686561584473, "objective/rlhf_reward": -15.129233155280275, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 12.370454788208008, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.3627585470676422, "step": 925, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.000096559524536 }, { "episode": 14832, "epoch": 0.266599561419276, "loss/policy_avg": 0.8356518745422363, "lr": 9.408231083844582e-06, "objective/entropy": -184.39404296875, "objective/kl": 42.4769401550293, "objective/non_score_reward": -4.24769401550293, "objective/rlhf_reward": -15.386656556192953, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 18.93747329711914, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.49698585271835327, "step": 926, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9984275102615356 }, { "episode": 14848, "epoch": 0.2668871553366646, "loss/policy_avg": 0.9323163628578186, "lr": 9.407592024539877e-06, "objective/entropy": -184.23748779296875, "objective/kl": 45.12583923339844, "objective/non_score_reward": -4.512584209442139, "objective/rlhf_reward": -18.050336837768555, "objective/scores": 0.0, "policy/approxkl_avg": 3.957935333251953, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6059246063232422, "step": 927, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9981895685195923 }, { "episode": 14864, "epoch": 0.26717474925405327, "loss/policy_avg": 1.5501456260681152, "lr": 9.406952965235174e-06, "objective/entropy": -222.2357635498047, "objective/kl": 45.133087158203125, "objective/non_score_reward": -4.513309001922607, "objective/rlhf_reward": -16.602638105960235, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 5.245251655578613, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.663169264793396, "step": 928, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.002445697784424 }, { "episode": 14880, "epoch": 0.2674623431714419, "loss/policy_avg": 0.06294722855091095, "lr": 9.40631390593047e-06, "objective/entropy": -219.2621307373047, "objective/kl": 35.440128326416016, "objective/non_score_reward": -3.544013023376465, "objective/rlhf_reward": -12.797449925032954, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 153.74978637695312, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.46772995591163635, "step": 929, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9976612329483032 }, { "episode": 14896, "epoch": 0.26774993708883055, "loss/policy_avg": 1.227535605430603, "lr": 9.405674846625768e-06, "objective/entropy": -223.289794921875, "objective/kl": 52.25310516357422, "objective/non_score_reward": -5.225310325622559, "objective/rlhf_reward": -18.501241779327394, "objective/scores": 0.6, "policy/approxkl_avg": 27.705047607421875, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7249962091445923, "step": 930, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.999952793121338 }, { "episode": 14912, "epoch": 0.26803753100621924, "loss/policy_avg": 0.8206846714019775, "lr": 9.405035787321065e-06, "objective/entropy": -198.30308532714844, "objective/kl": 39.47509002685547, "objective/non_score_reward": -3.947509527206421, "objective/rlhf_reward": -14.390037870407106, "objective/scores": 0.35, "policy/approxkl_avg": 59.78777313232422, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.5694193840026855, "step": 931, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9995026588439941 }, { "episode": 14928, "epoch": 0.2683251249236079, "loss/policy_avg": 0.9043534994125366, "lr": 9.404396728016361e-06, "objective/entropy": -184.1514434814453, "objective/kl": 29.442230224609375, "objective/non_score_reward": -2.944223165512085, "objective/rlhf_reward": -8.853173409343931, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 5.002543926239014, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4746544361114502, "step": 932, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0039162635803223 }, { "episode": 14944, "epoch": 0.2686127188409965, "loss/policy_avg": 0.00976651906967163, "lr": 9.403757668711657e-06, "objective/entropy": -182.20498657226562, "objective/kl": 47.182945251464844, "objective/non_score_reward": -4.718294620513916, "objective/rlhf_reward": -14.473178243637086, "objective/scores": 1.1, "policy/approxkl_avg": 5.810610294342041, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7361587285995483, "step": 933, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.00186824798584 }, { "episode": 14960, "epoch": 0.26890031275838516, "loss/policy_avg": 1.3497049808502197, "lr": 9.403118609406953e-06, "objective/entropy": -47.7762451171875, "objective/kl": 49.33927536010742, "objective/non_score_reward": -4.933927536010742, "objective/rlhf_reward": -18.41019752982251, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 4.110077857971191, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.627051830291748, "step": 934, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0001015663146973 }, { "episode": 14976, "epoch": 0.2691879066757738, "loss/policy_avg": 0.9171154499053955, "lr": 9.40247955010225e-06, "objective/entropy": -59.27357482910156, "objective/kl": 37.14779281616211, "objective/non_score_reward": -3.7147793769836426, "objective/rlhf_reward": -13.480515101043085, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 52.2471809387207, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5264706611633301, "step": 935, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.000478982925415 }, { "episode": 14992, "epoch": 0.26947550059316244, "loss/policy_avg": 0.06482569873332977, "lr": 9.401840490797547e-06, "objective/entropy": -391.59271240234375, "objective/kl": 33.689430236816406, "objective/non_score_reward": -3.368943214416504, "objective/rlhf_reward": -12.11652275297491, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 2.8014378547668457, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.760464072227478, "step": 936, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 1.9995503425598145 }, { "episode": 15008, "epoch": 0.2697630945105511, "loss/policy_avg": 0.03722277283668518, "lr": 9.401201431492844e-06, "objective/entropy": -255.18324279785156, "objective/kl": 55.505332946777344, "objective/non_score_reward": -5.550533771514893, "objective/rlhf_reward": -20.823532440749506, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 39.2294921875, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.66856449842453, "step": 937, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9981707334518433 }, { "episode": 15024, "epoch": 0.2700506884279397, "loss/policy_avg": 0.4313279092311859, "lr": 9.40056237218814e-06, "objective/entropy": -258.69293212890625, "objective/kl": 38.366519927978516, "objective/non_score_reward": -3.8366520404815674, "objective/rlhf_reward": -13.922776301105586, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 35.569087982177734, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8422372341156006, "step": 938, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9997689723968506 }, { "episode": 15040, "epoch": 0.2703382823453284, "loss/policy_avg": 0.5069754123687744, "lr": 9.399923312883436e-06, "objective/entropy": -305.361083984375, "objective/kl": 45.78974151611328, "objective/non_score_reward": -4.57897424697876, "objective/rlhf_reward": -16.93729529627929, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 18.944904327392578, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.30599963665008545, "step": 939, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.998727560043335 }, { "episode": 15056, "epoch": 0.27062587626271706, "loss/policy_avg": 0.995228111743927, "lr": 9.399284253578733e-06, "objective/entropy": 184.52796936035156, "objective/kl": 53.43628692626953, "objective/non_score_reward": -5.343628883361816, "objective/rlhf_reward": -16.974516487121583, "objective/scores": 1.1, "policy/approxkl_avg": 65.83000183105469, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4442574977874756, "step": 940, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9986300468444824 }, { "episode": 15072, "epoch": 0.2709134701801057, "loss/policy_avg": 0.5919173955917358, "lr": 9.39864519427403e-06, "objective/entropy": -17.191753387451172, "objective/kl": 56.80730438232422, "objective/non_score_reward": -5.680730819702148, "objective/rlhf_reward": -18.32292232513428, "objective/scores": 1.1, "policy/approxkl_avg": 3.7202348709106445, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.762370228767395, "step": 941, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9992384910583496 }, { "episode": 15088, "epoch": 0.27120106409749434, "loss/policy_avg": 1.1367782354354858, "lr": 9.398006134969327e-06, "objective/entropy": 24.78857421875, "objective/kl": 50.70988464355469, "objective/non_score_reward": -5.07098913192749, "objective/rlhf_reward": -18.883956527709962, "objective/scores": 0.35, "policy/approxkl_avg": 7.020835876464844, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6023867130279541, "step": 942, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.999258041381836 }, { "episode": 15104, "epoch": 0.271488658014883, "loss/policy_avg": 0.28944456577301025, "lr": 9.397367075664624e-06, "objective/entropy": 38.79130172729492, "objective/kl": 51.324073791503906, "objective/non_score_reward": -5.132407188415527, "objective/rlhf_reward": -19.20411637786023, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 18.69689178466797, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4386768937110901, "step": 943, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9981820583343506 }, { "episode": 15120, "epoch": 0.2717762519322716, "loss/policy_avg": 0.04539201408624649, "lr": 9.396728016359919e-06, "objective/entropy": -393.70343017578125, "objective/kl": 29.64443016052246, "objective/non_score_reward": -2.9644432067871094, "objective/rlhf_reward": -11.8577721118927, "objective/scores": 0.0, "policy/approxkl_avg": 12.188756942749023, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7838531136512756, "step": 944, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.9975636005401611 }, { "episode": 15136, "epoch": 0.27206384584966026, "loss/policy_avg": -0.8645926713943481, "lr": 9.396088957055216e-06, "objective/entropy": -124.75596618652344, "objective/kl": 37.55431365966797, "objective/non_score_reward": -3.7554311752319336, "objective/rlhf_reward": -13.41760495669039, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 30.16514778137207, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.9663676023483276, "step": 945, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 2.009145498275757 }, { "episode": 15152, "epoch": 0.27235143976704895, "loss/policy_avg": -0.9885836839675903, "lr": 9.395449897750511e-06, "objective/entropy": 23.736434936523438, "objective/kl": 47.477176666259766, "objective/non_score_reward": -4.74771785736084, "objective/rlhf_reward": -17.329011922300445, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 21.30612564086914, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.6642924547195435, "step": 946, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0021979808807373 }, { "episode": 15168, "epoch": 0.2726390336844376, "loss/policy_avg": 0.5419988632202148, "lr": 9.394810838445808e-06, "objective/entropy": -163.80274963378906, "objective/kl": 49.423614501953125, "objective/non_score_reward": -4.942361831665039, "objective/rlhf_reward": -18.42781191161218, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 63.652008056640625, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7244300842285156, "step": 947, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9981995820999146 }, { "episode": 15184, "epoch": 0.27292662760182623, "loss/policy_avg": -0.12660138309001923, "lr": 9.394171779141105e-06, "objective/entropy": -329.560302734375, "objective/kl": 39.731605529785156, "objective/non_score_reward": -3.9731602668762207, "objective/rlhf_reward": -14.514039137450556, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 17.309505462646484, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6581008434295654, "step": 948, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9999020099639893 }, { "episode": 15200, "epoch": 0.2732142215192149, "loss/policy_avg": -0.06109565496444702, "lr": 9.393532719836402e-06, "objective/entropy": -217.1125946044922, "objective/kl": 47.109092712402344, "objective/non_score_reward": -4.710909366607666, "objective/rlhf_reward": -17.46503529795776, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 93.11270141601562, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6009324193000793, "step": 949, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9982292652130127 }, { "episode": 15216, "epoch": 0.2735018154366035, "loss/policy_avg": 0.6483868956565857, "lr": 9.392893660531698e-06, "objective/entropy": -35.96177673339844, "objective/kl": 52.633846282958984, "objective/non_score_reward": -5.263384819030762, "objective/rlhf_reward": -21.05353856086731, "objective/scores": 0.0, "policy/approxkl_avg": 8.303674697875977, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5491381883621216, "step": 950, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0011985301971436 }, { "episode": 15232, "epoch": 0.27378940935399215, "loss/policy_avg": 1.7503316402435303, "lr": 9.392254601226994e-06, "objective/entropy": -13.0999755859375, "objective/kl": 52.63844299316406, "objective/non_score_reward": -5.2638444900512695, "objective/rlhf_reward": -18.932671727911504, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 6.220101833343506, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6228674650192261, "step": 951, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9985336065292358 }, { "episode": 15248, "epoch": 0.2740770032713808, "loss/policy_avg": -0.4814819097518921, "lr": 9.39161554192229e-06, "objective/entropy": -400.868408203125, "objective/kl": 46.192840576171875, "objective/non_score_reward": -4.619284152984619, "objective/rlhf_reward": -17.077136611938478, "objective/scores": 0.35, "policy/approxkl_avg": 7.3005571365356445, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.3778308033943176, "step": 952, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.002164602279663 }, { "episode": 15264, "epoch": 0.27436459718876943, "loss/policy_avg": 0.23375429213047028, "lr": 9.390976482617587e-06, "objective/entropy": -265.75054931640625, "objective/kl": 36.87724685668945, "objective/non_score_reward": -3.6877243518829346, "objective/rlhf_reward": -14.750897645950317, "objective/scores": 0.0, "policy/approxkl_avg": 1.4932501316070557, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8672538995742798, "step": 953, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 2.0001888275146484 }, { "episode": 15280, "epoch": 0.27465219110615813, "loss/policy_avg": 0.3528614044189453, "lr": 9.390337423312884e-06, "objective/entropy": -80.12568664550781, "objective/kl": 54.801692962646484, "objective/non_score_reward": -5.480169296264648, "objective/rlhf_reward": -20.595164570838136, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 1.5583480596542358, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.41291695833206177, "step": 954, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9998396635055542 }, { "episode": 15296, "epoch": 0.27493978502354677, "loss/policy_avg": 1.2501368522644043, "lr": 9.389698364008181e-06, "objective/entropy": -262.31793212890625, "objective/kl": 40.21039581298828, "objective/non_score_reward": -4.0210394859313965, "objective/rlhf_reward": -13.160439883114073, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 41.2574462890625, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5917608141899109, "step": 955, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9988981485366821 }, { "episode": 15312, "epoch": 0.2752273789409354, "loss/policy_avg": 3.28550386428833, "lr": 9.389059304703478e-06, "objective/entropy": -228.41085815429688, "objective/kl": 46.59629440307617, "objective/non_score_reward": -4.6596293449401855, "objective/rlhf_reward": -17.313004527121706, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 3.093324661254883, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5825387239456177, "step": 956, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.00113582611084 }, { "episode": 15328, "epoch": 0.27551497285832405, "loss/policy_avg": 0.926065981388092, "lr": 9.388420245398773e-06, "objective/entropy": -161.94918823242188, "objective/kl": 40.23236083984375, "objective/non_score_reward": -4.023235321044922, "objective/rlhf_reward": -13.692942714691164, "objective/scores": 0.6, "policy/approxkl_avg": 36.19329833984375, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5999109148979187, "step": 957, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.995762586593628 }, { "episode": 15344, "epoch": 0.2758025667757127, "loss/policy_avg": 0.9499435424804688, "lr": 9.38778118609407e-06, "objective/entropy": -131.8387451171875, "objective/kl": 52.344879150390625, "objective/non_score_reward": -5.234487533569336, "objective/rlhf_reward": -19.514119465549555, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 97.26606750488281, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5245336294174194, "step": 958, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9986298084259033 }, { "episode": 15360, "epoch": 0.2760901606931013, "loss/policy_avg": -0.003145521506667137, "lr": 9.387142126789367e-06, "objective/entropy": -165.37252807617188, "objective/kl": 54.373069763183594, "objective/non_score_reward": -5.437307357788086, "objective/rlhf_reward": -18.82550874793646, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 13.774616241455078, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5953001976013184, "step": 959, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0011730194091797 }, { "episode": 15376, "epoch": 0.27637775461048997, "loss/policy_avg": 0.4764801859855652, "lr": 9.386503067484664e-06, "objective/entropy": -242.42982482910156, "objective/kl": 56.915626525878906, "objective/non_score_reward": -5.691562652587891, "objective/rlhf_reward": -22.766250133514404, "objective/scores": 0.0, "policy/approxkl_avg": 29.58062744140625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6144382953643799, "step": 960, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9959168434143066 }, { "episode": 15392, "epoch": 0.27666534852787866, "loss/policy_avg": 1.376183271408081, "lr": 9.38586400817996e-06, "objective/entropy": -322.4050598144531, "objective/kl": 36.3888053894043, "objective/non_score_reward": -3.638880729675293, "objective/rlhf_reward": -13.213887742071776, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 44.334632873535156, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.613652765750885, "step": 961, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9986045360565186 }, { "episode": 15408, "epoch": 0.2769529424452673, "loss/policy_avg": 1.6320432424545288, "lr": 9.385224948875256e-06, "objective/entropy": -79.29718017578125, "objective/kl": 59.607242584228516, "objective/non_score_reward": -5.960724353790283, "objective/rlhf_reward": -22.51738360884778, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 87.03724670410156, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5909746885299683, "step": 962, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.998765468597412 }, { "episode": 15424, "epoch": 0.27724053636265594, "loss/policy_avg": 2.325925588607788, "lr": 9.384585889570553e-06, "objective/entropy": -113.67591857910156, "objective/kl": 55.53913116455078, "objective/non_score_reward": -5.553913116455078, "objective/rlhf_reward": -20.482319132486978, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 8.882341384887695, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6297260522842407, "step": 963, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9990952014923096 }, { "episode": 15440, "epoch": 0.2775281302800446, "loss/policy_avg": 0.5869482159614563, "lr": 9.38394683026585e-06, "objective/entropy": -206.8366241455078, "objective/kl": 48.33332824707031, "objective/non_score_reward": -4.833332538604736, "objective/rlhf_reward": -14.933330869674684, "objective/scores": 1.1, "policy/approxkl_avg": 49.717071533203125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7187210917472839, "step": 964, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9980008602142334 }, { "episode": 15456, "epoch": 0.2778157241974332, "loss/policy_avg": 1.1654845476150513, "lr": 9.383307770961147e-06, "objective/entropy": -379.75067138671875, "objective/kl": 57.3199462890625, "objective/non_score_reward": -5.73199462890625, "objective/rlhf_reward": -20.8052737138429, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 37.82645034790039, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5788445472717285, "step": 965, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9972472190856934 }, { "episode": 15472, "epoch": 0.27810331811482186, "loss/policy_avg": 0.9802603721618652, "lr": 9.382668711656443e-06, "objective/entropy": 101.25556945800781, "objective/kl": 61.29494857788086, "objective/non_score_reward": -6.129494667053223, "objective/rlhf_reward": -21.5942603691828, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 45.391502380371094, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5674448013305664, "step": 966, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9998470544815063 }, { "episode": 15488, "epoch": 0.2783909120322105, "loss/policy_avg": 1.8580645322799683, "lr": 9.382029652351739e-06, "objective/entropy": -102.23580932617188, "objective/kl": 37.78134536743164, "objective/non_score_reward": -3.77813458442688, "objective/rlhf_reward": -13.787025485068483, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 1.6718518733978271, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.3454323410987854, "step": 967, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.001361131668091 }, { "episode": 15504, "epoch": 0.27867850594959914, "loss/policy_avg": 0.8354310393333435, "lr": 9.381390593047035e-06, "objective/entropy": -306.50030517578125, "objective/kl": 31.03956413269043, "objective/non_score_reward": -3.103956460952759, "objective/rlhf_reward": -10.859566538539484, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 0.7312620878219604, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.37779200077056885, "step": 968, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.000714063644409 }, { "episode": 15520, "epoch": 0.27896609986698784, "loss/policy_avg": 0.009227469563484192, "lr": 9.380751533742332e-06, "objective/entropy": -336.61578369140625, "objective/kl": 41.42436599731445, "objective/non_score_reward": -4.142436504364014, "objective/rlhf_reward": -15.119147877307281, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 8.629606246948242, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.3866472542285919, "step": 969, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.999985933303833 }, { "episode": 15536, "epoch": 0.2792536937843765, "loss/policy_avg": -0.041605472564697266, "lr": 9.380112474437628e-06, "objective/entropy": -282.89971923828125, "objective/kl": 48.89763259887695, "objective/non_score_reward": -4.889763355255127, "objective/rlhf_reward": -17.436346950308355, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 5.264449596405029, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6422872543334961, "step": 970, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9998466968536377 }, { "episode": 15552, "epoch": 0.2795412877017651, "loss/policy_avg": 0.36232057213783264, "lr": 9.379473415132924e-06, "objective/entropy": -300.8648681640625, "objective/kl": 41.13935852050781, "objective/non_score_reward": -4.113935470581055, "objective/rlhf_reward": -15.130230460196657, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 50.30149841308594, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5609759092330933, "step": 971, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9985394477844238 }, { "episode": 15568, "epoch": 0.27982888161915376, "loss/policy_avg": 0.5932891964912415, "lr": 9.378834355828221e-06, "objective/entropy": -174.238037109375, "objective/kl": 50.591854095458984, "objective/non_score_reward": -5.059185981750488, "objective/rlhf_reward": -18.7861455484346, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 36.18544006347656, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4679606854915619, "step": 972, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9989001750946045 }, { "episode": 15584, "epoch": 0.2801164755365424, "loss/policy_avg": 1.0932896137237549, "lr": 9.378195296523518e-06, "objective/entropy": -144.99090576171875, "objective/kl": 58.75678253173828, "objective/non_score_reward": -5.875679016113281, "objective/rlhf_reward": -22.021762016232373, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 71.56244659423828, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5289046764373779, "step": 973, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9984965324401855 }, { "episode": 15600, "epoch": 0.28040406945393104, "loss/policy_avg": 0.6273190379142761, "lr": 9.377556237218815e-06, "objective/entropy": -122.2640151977539, "objective/kl": 44.235450744628906, "objective/non_score_reward": -4.423544883728027, "objective/rlhf_reward": -17.694178819656372, "objective/scores": 0.0, "policy/approxkl_avg": 11.347247123718262, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.3611798584461212, "step": 974, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.999008297920227 }, { "episode": 15616, "epoch": 0.2806916633713197, "loss/policy_avg": 0.09601998329162598, "lr": 9.37691717791411e-06, "objective/entropy": -131.6696014404297, "objective/kl": 48.32147979736328, "objective/non_score_reward": -4.832147598266602, "objective/rlhf_reward": -17.81282004097336, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 7.570517063140869, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5157725811004639, "step": 975, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0004305839538574 }, { "episode": 15632, "epoch": 0.2809792572887083, "loss/policy_avg": -0.13817770779132843, "lr": 9.376278118609407e-06, "objective/entropy": -255.62921142578125, "objective/kl": 53.164588928222656, "objective/non_score_reward": -5.316458702087402, "objective/rlhf_reward": -19.750063502582247, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 2.5946006774902344, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5641098618507385, "step": 976, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0020365715026855 }, { "episode": 15648, "epoch": 0.281266851206097, "loss/policy_avg": 1.142382264137268, "lr": 9.375639059304704e-06, "objective/entropy": -25.651283264160156, "objective/kl": 67.4949722290039, "objective/non_score_reward": -6.749497413635254, "objective/rlhf_reward": -25.619387486068113, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 5.648348808288574, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6707190275192261, "step": 977, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9988956451416016 }, { "episode": 15664, "epoch": 0.28155444512348565, "loss/policy_avg": 0.6753113269805908, "lr": 9.375000000000001e-06, "objective/entropy": -391.8768005371094, "objective/kl": 44.86485290527344, "objective/non_score_reward": -4.486485481262207, "objective/rlhf_reward": -13.545940971374513, "objective/scores": 1.1, "policy/approxkl_avg": 6.812070846557617, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.8541865348815918, "step": 978, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 17, "val/ratio": 2.000732183456421 }, { "episode": 15680, "epoch": 0.2818420390408743, "loss/policy_avg": 0.3577782213687897, "lr": 9.374360940695298e-06, "objective/entropy": -254.20706176757812, "objective/kl": 47.57148742675781, "objective/non_score_reward": -4.757148742675781, "objective/rlhf_reward": -17.203765745433877, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 93.71537780761719, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6484947800636292, "step": 979, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9985871315002441 }, { "episode": 15696, "epoch": 0.28212963295826293, "loss/policy_avg": -0.1328345537185669, "lr": 9.373721881390595e-06, "objective/entropy": 28.385881423950195, "objective/kl": 49.55193328857422, "objective/non_score_reward": -4.955193042755127, "objective/rlhf_reward": -18.44217047938476, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 10.326236724853516, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7927839159965515, "step": 980, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0010528564453125 }, { "episode": 15712, "epoch": 0.2824172268756516, "loss/policy_avg": 0.12570828199386597, "lr": 9.37308282208589e-06, "objective/entropy": -141.36134338378906, "objective/kl": 54.92476272583008, "objective/non_score_reward": -5.492476463317871, "objective/rlhf_reward": -20.454133355411226, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 3.49436354637146, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5999050140380859, "step": 981, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.0009312629699707 }, { "episode": 15728, "epoch": 0.2827048207930402, "loss/policy_avg": 2.4626569747924805, "lr": 9.372443762781187e-06, "objective/entropy": -266.347900390625, "objective/kl": 47.33668899536133, "objective/non_score_reward": -4.733669281005859, "objective/rlhf_reward": -17.330556426111777, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 3.0285089015960693, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5679370164871216, "step": 982, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 2.0021414756774902 }, { "episode": 15744, "epoch": 0.28299241471042885, "loss/policy_avg": 0.1242997869849205, "lr": 9.371804703476484e-06, "objective/entropy": -346.93878173828125, "objective/kl": 35.437347412109375, "objective/non_score_reward": -3.5437347888946533, "objective/rlhf_reward": -11.774938917160034, "objective/scores": 0.6, "policy/approxkl_avg": 42.03936004638672, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.617620587348938, "step": 983, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9993083477020264 }, { "episode": 15760, "epoch": 0.28328000862781755, "loss/policy_avg": 0.14840182662010193, "lr": 9.37116564417178e-06, "objective/entropy": -368.4073486328125, "objective/kl": 28.31252670288086, "objective/non_score_reward": -2.8312525749206543, "objective/rlhf_reward": -9.202304425016914, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 11.93632698059082, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6155978441238403, "step": 984, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9987332820892334 }, { "episode": 15776, "epoch": 0.2835676025452062, "loss/policy_avg": 0.47417038679122925, "lr": 9.370526584867077e-06, "objective/entropy": -141.45626831054688, "objective/kl": 50.39579772949219, "objective/non_score_reward": -5.039579391479492, "objective/rlhf_reward": -18.79906793806402, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 97.74449157714844, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5000198483467102, "step": 985, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.001434803009033 }, { "episode": 15792, "epoch": 0.2838551964625948, "loss/policy_avg": 1.3729290962219238, "lr": 9.369887525562373e-06, "objective/entropy": -411.28570556640625, "objective/kl": 35.94482421875, "objective/non_score_reward": -3.594482421875, "objective/rlhf_reward": -9.97793016433716, "objective/scores": 1.1, "policy/approxkl_avg": 10.14614486694336, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6557059288024902, "step": 986, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 2.0011909008026123 }, { "episode": 15808, "epoch": 0.28414279037998347, "loss/policy_avg": 3.4433889389038086, "lr": 9.36924846625767e-06, "objective/entropy": -201.43887329101562, "objective/kl": 51.010643005371094, "objective/non_score_reward": -5.1010637283325195, "objective/rlhf_reward": -18.28155011154798, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 24.718257904052734, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.6495320200920105, "step": 987, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9984331130981445 }, { "episode": 15824, "epoch": 0.2844303842973721, "loss/policy_avg": 2.152106761932373, "lr": 9.368609406952966e-06, "objective/entropy": -109.68426513671875, "objective/kl": 36.905975341796875, "objective/non_score_reward": -3.6905977725982666, "objective/rlhf_reward": -13.206131546702935, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 6.616457939147949, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.632394552230835, "step": 988, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9983766078948975 }, { "episode": 15840, "epoch": 0.28471797821476075, "loss/policy_avg": 0.4088176488876343, "lr": 9.367970347648263e-06, "objective/entropy": -241.9044647216797, "objective/kl": 47.368995666503906, "objective/non_score_reward": -4.736899375915527, "objective/rlhf_reward": -17.58834787580816, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 2.87666654586792, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.7490575313568115, "step": 989, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9989919662475586 }, { "episode": 15856, "epoch": 0.2850055721321494, "loss/policy_avg": 6.076893329620361, "lr": 9.367331288343558e-06, "objective/entropy": -96.00433349609375, "objective/kl": 45.789695739746094, "objective/non_score_reward": -4.578969955444336, "objective/rlhf_reward": -15.39215913856146, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 15.434422492980957, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5452468991279602, "step": 990, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0014736652374268 }, { "episode": 15872, "epoch": 0.285293166049538, "loss/policy_avg": -0.9232574701309204, "lr": 9.366692229038855e-06, "objective/entropy": -144.09912109375, "objective/kl": 46.76097869873047, "objective/non_score_reward": -4.676098346710205, "objective/rlhf_reward": -17.345143043731134, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 1.897438645362854, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6364270448684692, "step": 991, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0024867057800293 }, { "episode": 15888, "epoch": 0.2855807599669267, "loss/policy_avg": 0.041624486446380615, "lr": 9.366053169734152e-06, "objective/entropy": -139.56771850585938, "objective/kl": 38.63270950317383, "objective/non_score_reward": -3.8632712364196777, "objective/rlhf_reward": -11.05308494567871, "objective/scores": 1.1, "policy/approxkl_avg": 9.528964042663574, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6615604162216187, "step": 992, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.002514600753784 }, { "episode": 15904, "epoch": 0.28586835388431536, "loss/policy_avg": 0.31544625759124756, "lr": 9.365414110429449e-06, "objective/entropy": -208.21640014648438, "objective/kl": 53.08674240112305, "objective/non_score_reward": -5.308674335479736, "objective/rlhf_reward": -19.834697103500368, "objective/scores": 0.35, "policy/approxkl_avg": 6.563029766082764, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6068991422653198, "step": 993, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9984232187271118 }, { "episode": 15920, "epoch": 0.286155947801704, "loss/policy_avg": -0.1134636178612709, "lr": 9.364775051124744e-06, "objective/entropy": -341.1083984375, "objective/kl": 38.53087615966797, "objective/non_score_reward": -3.8530876636505127, "objective/rlhf_reward": -13.808230671946127, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 2.8780159950256348, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.47948139905929565, "step": 994, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.002434015274048 }, { "episode": 15936, "epoch": 0.28644354171909264, "loss/policy_avg": 0.7322211265563965, "lr": 9.364135991820041e-06, "objective/entropy": -335.6710510253906, "objective/kl": 40.26041793823242, "objective/non_score_reward": -4.0260419845581055, "objective/rlhf_reward": -14.588396155627901, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 4.264369010925293, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6806702017784119, "step": 995, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 2.0014522075653076 }, { "episode": 15952, "epoch": 0.2867311356364813, "loss/policy_avg": 0.9409044981002808, "lr": 9.363496932515338e-06, "objective/entropy": -169.52256774902344, "objective/kl": 33.46562576293945, "objective/non_score_reward": -3.346562385559082, "objective/rlhf_reward": -10.98624954223633, "objective/scores": 0.6, "policy/approxkl_avg": 16.522533416748047, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.552007794380188, "step": 996, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9994235038757324 }, { "episode": 15968, "epoch": 0.2870187295538699, "loss/policy_avg": 0.1874466985464096, "lr": 9.362857873210635e-06, "objective/entropy": 49.27545928955078, "objective/kl": 53.142845153808594, "objective/non_score_reward": -5.314284324645996, "objective/rlhf_reward": -19.653018269602377, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 20.86511993408203, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4496247172355652, "step": 997, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9997801780700684 }, { "episode": 15984, "epoch": 0.28730632347125856, "loss/policy_avg": 0.2505379319190979, "lr": 9.362218813905932e-06, "objective/entropy": -353.1224365234375, "objective/kl": 32.73716354370117, "objective/non_score_reward": -3.273716449737549, "objective/rlhf_reward": -11.753230860739379, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 0.9558770656585693, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5755044221878052, "step": 998, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9991295337677002 }, { "episode": 16000, "epoch": 0.28759391738864726, "loss/policy_avg": 0.1776025891304016, "lr": 9.361579754601227e-06, "objective/entropy": -239.99009704589844, "objective/kl": 40.456748962402344, "objective/non_score_reward": -4.045674800872803, "objective/rlhf_reward": -14.841064026861815, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 104.76905822753906, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5998741388320923, "step": 999, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9990432262420654 }, { "episode": 16016, "epoch": 0.2878815113060359, "loss/policy_avg": 0.43448132276535034, "lr": 9.360940695296524e-06, "objective/entropy": -165.23594665527344, "objective/kl": 34.56150817871094, "objective/non_score_reward": -3.456151247024536, "objective/rlhf_reward": -12.40077312727746, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 1.3405377864837646, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.721713662147522, "step": 1000, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0021862983703613 }, { "episode": 16032, "epoch": 0.28816910522342454, "loss/policy_avg": 0.062110334634780884, "lr": 9.36030163599182e-06, "objective/entropy": -334.8751220703125, "objective/kl": 48.182403564453125, "objective/non_score_reward": -4.818240642547607, "objective/rlhf_reward": -17.894360878554682, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 40.743797302246094, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7242110371589661, "step": 1001, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9991108179092407 }, { "episode": 16048, "epoch": 0.2884566991408132, "loss/policy_avg": 0.16843795776367188, "lr": 9.359662576687117e-06, "objective/entropy": -178.37261962890625, "objective/kl": 38.65277862548828, "objective/non_score_reward": -3.8652780055999756, "objective/rlhf_reward": -13.636282797130654, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 6.08242130279541, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4641672372817993, "step": 1002, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0011541843414307 }, { "episode": 16064, "epoch": 0.2887442930582018, "loss/policy_avg": 0.8312854766845703, "lr": 9.359023517382414e-06, "objective/entropy": -192.33570861816406, "objective/kl": 41.62236404418945, "objective/non_score_reward": -4.162236213684082, "objective/rlhf_reward": -15.167991760189892, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 11.876651763916016, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4978201389312744, "step": 1003, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.999896764755249 }, { "episode": 16080, "epoch": 0.28903188697559046, "loss/policy_avg": 0.5974471569061279, "lr": 9.358384458077711e-06, "objective/entropy": -413.79345703125, "objective/kl": 42.26923751831055, "objective/non_score_reward": -4.226923942565918, "objective/rlhf_reward": -15.45709715327774, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 27.048864364624023, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5744391679763794, "step": 1004, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 2.0014076232910156 }, { "episode": 16096, "epoch": 0.2893194808929791, "loss/policy_avg": -0.07608169317245483, "lr": 9.357745398773006e-06, "objective/entropy": 22.847129821777344, "objective/kl": 44.39021682739258, "objective/non_score_reward": -4.439021587371826, "objective/rlhf_reward": -13.356086587905885, "objective/scores": 1.1, "policy/approxkl_avg": 3.3195881843566895, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.43350324034690857, "step": 1005, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0106523036956787 }, { "episode": 16112, "epoch": 0.28960707481036774, "loss/policy_avg": -0.051718249917030334, "lr": 9.357106339468303e-06, "objective/entropy": -411.0462646484375, "objective/kl": 34.006595611572266, "objective/non_score_reward": -3.4006595611572266, "objective/rlhf_reward": -11.655226538853581, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 7.821033477783203, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.6802281141281128, "step": 1006, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.000204086303711 }, { "episode": 16128, "epoch": 0.28989466872775643, "loss/policy_avg": 1.4365193843841553, "lr": 9.3564672801636e-06, "objective/entropy": 36.35121154785156, "objective/kl": 48.03932571411133, "objective/non_score_reward": -4.803932189941406, "objective/rlhf_reward": -17.837127544967036, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 171.6711883544922, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5944077372550964, "step": 1007, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9973459243774414 }, { "episode": 16144, "epoch": 0.2901822626451451, "loss/policy_avg": 0.6622794270515442, "lr": 9.355828220858897e-06, "objective/entropy": -341.08636474609375, "objective/kl": 44.81585693359375, "objective/non_score_reward": -4.481585502624512, "objective/rlhf_reward": -15.526343679428102, "objective/scores": 0.6, "policy/approxkl_avg": 22.366912841796875, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6235027313232422, "step": 1008, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9990150928497314 }, { "episode": 16160, "epoch": 0.2904698565625337, "loss/policy_avg": 0.13813085854053497, "lr": 9.355189161554194e-06, "objective/entropy": -410.0962219238281, "objective/kl": 36.286888122558594, "objective/non_score_reward": -3.6286890506744385, "objective/rlhf_reward": -13.173120310812621, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 8.510043144226074, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.578189492225647, "step": 1009, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 2.002636671066284 }, { "episode": 16176, "epoch": 0.29075745047992235, "loss/policy_avg": 1.200803279876709, "lr": 9.35455010224949e-06, "objective/entropy": -386.8243408203125, "objective/kl": 43.30828857421875, "objective/non_score_reward": -4.330828666687012, "objective/rlhf_reward": -15.71919563776644, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 5.595213890075684, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6449472904205322, "step": 1010, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 15, "val/ratio": 2.000666379928589 }, { "episode": 16192, "epoch": 0.291045044397311, "loss/policy_avg": 0.542273998260498, "lr": 9.353911042944786e-06, "objective/entropy": -197.82107543945312, "objective/kl": 40.368980407714844, "objective/non_score_reward": -4.036898136138916, "objective/rlhf_reward": -14.805956414251952, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 11.149993896484375, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6117680072784424, "step": 1011, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9988832473754883 }, { "episode": 16208, "epoch": 0.29133263831469963, "loss/policy_avg": 1.6512575149536133, "lr": 9.353271983640083e-06, "objective/entropy": -147.8699951171875, "objective/kl": 41.501712799072266, "objective/non_score_reward": -4.150171279907227, "objective/rlhf_reward": -15.275173220664186, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 3.4657955169677734, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5207608938217163, "step": 1012, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.00093412399292 }, { "episode": 16224, "epoch": 0.2916202322320883, "loss/policy_avg": 1.1889203786849976, "lr": 9.352632924335378e-06, "objective/entropy": -201.8603515625, "objective/kl": 52.98762512207031, "objective/non_score_reward": -5.298763275146484, "objective/rlhf_reward": -19.853416493445067, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 29.695945739746094, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5388121008872986, "step": 1013, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9970934391021729 }, { "episode": 16240, "epoch": 0.2919078261494769, "loss/policy_avg": 0.8848397135734558, "lr": 9.351993865030675e-06, "objective/entropy": -251.18350219726562, "objective/kl": 40.47871398925781, "objective/non_score_reward": -4.0478715896606445, "objective/rlhf_reward": -14.635226338115288, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 5.102502346038818, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.45284512639045715, "step": 1014, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9991495609283447 }, { "episode": 16256, "epoch": 0.2921954200668656, "loss/policy_avg": 6.158147811889648, "lr": 9.351354805725972e-06, "objective/entropy": -334.4002685546875, "objective/kl": 52.937347412109375, "objective/non_score_reward": -5.293735027313232, "objective/rlhf_reward": -19.774939870834352, "objective/scores": 0.35, "policy/approxkl_avg": 22.463531494140625, "policy/clipfrac_avg": 0.25, "policy/entropy_avg": 0.7278158664703369, "step": 1015, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9989426136016846 }, { "episode": 16272, "epoch": 0.29248301398425425, "loss/policy_avg": 0.5470467805862427, "lr": 9.350715746421269e-06, "objective/entropy": -340.1455993652344, "objective/kl": 42.530006408691406, "objective/non_score_reward": -4.253000259399414, "objective/rlhf_reward": -14.612001991271974, "objective/scores": 0.6, "policy/approxkl_avg": 4.3563923835754395, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5506184101104736, "step": 1016, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.998374581336975 }, { "episode": 16288, "epoch": 0.2927706079016429, "loss/policy_avg": -0.7165657877922058, "lr": 9.350076687116566e-06, "objective/entropy": -257.8377685546875, "objective/kl": 54.90354919433594, "objective/non_score_reward": -5.490355491638184, "objective/rlhf_reward": -21.961421012878418, "objective/scores": 0.0, "policy/approxkl_avg": 61.379520416259766, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5922842621803284, "step": 1017, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0011649131774902 }, { "episode": 16304, "epoch": 0.2930582018190315, "loss/policy_avg": -0.044895902276039124, "lr": 9.34943762781186e-06, "objective/entropy": -161.30465698242188, "objective/kl": 42.11552810668945, "objective/non_score_reward": -4.21155309677124, "objective/rlhf_reward": -15.486962520812433, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 21.626054763793945, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7003655433654785, "step": 1018, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0008544921875 }, { "episode": 16320, "epoch": 0.29334579573642017, "loss/policy_avg": 0.5831981897354126, "lr": 9.348798568507158e-06, "objective/entropy": -343.4378967285156, "objective/kl": 37.45512390136719, "objective/non_score_reward": -3.7455124855041504, "objective/rlhf_reward": -13.58205018043518, "objective/scores": 0.35, "policy/approxkl_avg": 24.191818237304688, "policy/clipfrac_avg": 0.25, "policy/entropy_avg": 0.49668240547180176, "step": 1019, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9997553825378418 }, { "episode": 16336, "epoch": 0.2936333896538088, "loss/policy_avg": 0.5188800692558289, "lr": 9.348159509202455e-06, "objective/entropy": -149.73580932617188, "objective/kl": 34.930999755859375, "objective/non_score_reward": -3.493100166320801, "objective/rlhf_reward": -12.613150560592096, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 19.12890625, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6794183254241943, "step": 1020, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9991793632507324 }, { "episode": 16352, "epoch": 0.29392098357119745, "loss/policy_avg": 1.623844861984253, "lr": 9.347520449897751e-06, "objective/entropy": -203.75643920898438, "objective/kl": 46.22223663330078, "objective/non_score_reward": -4.622223854064941, "objective/rlhf_reward": -15.565175925136778, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 10.170186042785645, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.38755860924720764, "step": 1021, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.998436689376831 }, { "episode": 16368, "epoch": 0.29420857748858614, "loss/policy_avg": 0.912840723991394, "lr": 9.346881390593048e-06, "objective/entropy": -243.21624755859375, "objective/kl": 44.858360290527344, "objective/non_score_reward": -4.485836029052734, "objective/rlhf_reward": -16.339224133555014, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 22.48243522644043, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5979580283164978, "step": 1022, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.998530626296997 }, { "episode": 16384, "epoch": 0.2944961714059748, "loss/policy_avg": 0.1390291452407837, "lr": 9.346242331288345e-06, "objective/entropy": -224.34439086914062, "objective/kl": 33.47166442871094, "objective/non_score_reward": -3.347166061401367, "objective/rlhf_reward": -10.98866400718689, "objective/scores": 0.6, "policy/approxkl_avg": 14.310105323791504, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.3379809260368347, "step": 1023, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.996863842010498 }, { "episode": 16400, "epoch": 0.2947837653233634, "loss/policy_avg": 1.4108867645263672, "lr": 9.34560327198364e-06, "objective/entropy": -333.73321533203125, "objective/kl": 36.44195556640625, "objective/non_score_reward": -3.644195556640625, "objective/rlhf_reward": -13.095829847271801, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 38.91334533691406, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5991511344909668, "step": 1024, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9986368417739868 }, { "episode": 16416, "epoch": 0.29507135924075206, "loss/policy_avg": -0.027962714433670044, "lr": 9.344964212678937e-06, "objective/entropy": -288.2020263671875, "objective/kl": 47.59730911254883, "objective/non_score_reward": -4.759731292724609, "objective/rlhf_reward": -17.697288325338988, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 131.87156677246094, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.660065233707428, "step": 1025, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0027992725372314 }, { "episode": 16432, "epoch": 0.2953589531581407, "loss/policy_avg": -0.10264541208744049, "lr": 9.344325153374234e-06, "objective/entropy": -286.8740234375, "objective/kl": 35.00936508178711, "objective/non_score_reward": -3.500936508178711, "objective/rlhf_reward": -9.60374674797058, "objective/scores": 1.1, "policy/approxkl_avg": 17.956850051879883, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7084720134735107, "step": 1026, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9991661310195923 }, { "episode": 16448, "epoch": 0.29564654707552934, "loss/policy_avg": -0.5100857615470886, "lr": 9.343686094069531e-06, "objective/entropy": -214.54248046875, "objective/kl": 48.03523254394531, "objective/non_score_reward": -4.803523540496826, "objective/rlhf_reward": -17.76349649867569, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 2.4936773777008057, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.6490300297737122, "step": 1027, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.000095844268799 }, { "episode": 16464, "epoch": 0.295934140992918, "loss/policy_avg": 1.6303435564041138, "lr": 9.343047034764828e-06, "objective/entropy": -387.60504150390625, "objective/kl": 44.20216369628906, "objective/non_score_reward": -4.4202165603637695, "objective/rlhf_reward": -16.280865049362184, "objective/scores": 0.35, "policy/approxkl_avg": 35.194862365722656, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6738216876983643, "step": 1028, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.996845006942749 }, { "episode": 16480, "epoch": 0.2962217349103066, "loss/policy_avg": 0.4686249792575836, "lr": 9.342407975460123e-06, "objective/entropy": -142.21923828125, "objective/kl": 52.81471252441406, "objective/non_score_reward": -5.2814717292785645, "objective/rlhf_reward": -19.702055056293574, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 34.356407165527344, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6376465559005737, "step": 1029, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.998791217803955 }, { "episode": 16496, "epoch": 0.2965093288276953, "loss/policy_avg": -0.3098523020744324, "lr": 9.34176891615542e-06, "objective/entropy": -269.3659973144531, "objective/kl": 48.13602828979492, "objective/non_score_reward": -4.813602924346924, "objective/rlhf_reward": -17.830579121311274, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 66.14453125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.553117036819458, "step": 1030, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9972491264343262 }, { "episode": 16512, "epoch": 0.29679692274508396, "loss/policy_avg": 0.4175041913986206, "lr": 9.341129856850717e-06, "objective/entropy": -377.17852783203125, "objective/kl": 31.1174259185791, "objective/non_score_reward": -3.1117427349090576, "objective/rlhf_reward": -10.966018321926951, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 41.21112060546875, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5598743557929993, "step": 1031, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.999720811843872 }, { "episode": 16528, "epoch": 0.2970845166624726, "loss/policy_avg": -0.033336080610752106, "lr": 9.340490797546014e-06, "objective/entropy": -167.51239013671875, "objective/kl": 45.70878601074219, "objective/non_score_reward": -4.570878982543945, "objective/rlhf_reward": -16.924265825484675, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 83.41807556152344, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7573748826980591, "step": 1032, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.000248432159424 }, { "episode": 16544, "epoch": 0.29737211057986124, "loss/policy_avg": 0.5428593158721924, "lr": 9.33985173824131e-06, "objective/entropy": -306.9107971191406, "objective/kl": 38.025726318359375, "objective/non_score_reward": -3.80257248878479, "objective/rlhf_reward": -13.831687548247675, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 35.42212677001953, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7339252233505249, "step": 1033, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9977548122406006 }, { "episode": 16560, "epoch": 0.2976597044972499, "loss/policy_avg": -0.20993635058403015, "lr": 9.339212678936606e-06, "objective/entropy": -437.900634765625, "objective/kl": 36.12052536010742, "objective/non_score_reward": -3.6120524406433105, "objective/rlhf_reward": -11.524490748287413, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 0.9966515898704529, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.5690695643424988, "step": 1034, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.0003795623779297 }, { "episode": 16576, "epoch": 0.2979472984146385, "loss/policy_avg": 0.8408477902412415, "lr": 9.338573619631903e-06, "objective/entropy": -144.86044311523438, "objective/kl": 41.49348831176758, "objective/non_score_reward": -4.149348735809326, "objective/rlhf_reward": -14.935536151350128, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 7.985556125640869, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7659680843353271, "step": 1035, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.9998505115509033 }, { "episode": 16592, "epoch": 0.29823489233202716, "loss/policy_avg": 0.536194384098053, "lr": 9.3379345603272e-06, "objective/entropy": -176.1175537109375, "objective/kl": 28.366390228271484, "objective/non_score_reward": -2.836639165878296, "objective/rlhf_reward": -10.021044049292726, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 3.823239803314209, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6091227531433105, "step": 1036, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9980281591415405 }, { "episode": 16608, "epoch": 0.2985224862494158, "loss/policy_avg": -0.008736256510019302, "lr": 9.337295501022495e-06, "objective/entropy": -404.09527587890625, "objective/kl": 38.680389404296875, "objective/non_score_reward": -3.8680386543273926, "objective/rlhf_reward": -14.146641764670534, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 54.77943420410156, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6204895973205566, "step": 1037, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.998094081878662 }, { "episode": 16624, "epoch": 0.2988100801668045, "loss/policy_avg": 0.8643017411231995, "lr": 9.336656441717792e-06, "objective/entropy": -98.56698608398438, "objective/kl": 50.609886169433594, "objective/non_score_reward": -5.060988903045654, "objective/rlhf_reward": -18.510621325174967, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 62.80495834350586, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.3460232615470886, "step": 1038, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.99924635887146 }, { "episode": 16640, "epoch": 0.29909767408419313, "loss/policy_avg": 1.396580696105957, "lr": 9.336017382413088e-06, "objective/entropy": -234.46487426757812, "objective/kl": 42.40861511230469, "objective/non_score_reward": -4.240861892700195, "objective/rlhf_reward": -15.563446378707887, "objective/scores": 0.35, "policy/approxkl_avg": 41.982421875, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.536030113697052, "step": 1039, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9999630451202393 }, { "episode": 16656, "epoch": 0.2993852680015818, "loss/policy_avg": 0.21121357381343842, "lr": 9.335378323108385e-06, "objective/entropy": -365.265380859375, "objective/kl": 42.0318717956543, "objective/non_score_reward": -4.203187465667725, "objective/rlhf_reward": -15.471113732367186, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 13.188464164733887, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6613526344299316, "step": 1040, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9987871646881104 }, { "episode": 16672, "epoch": 0.2996728619189704, "loss/policy_avg": 2.4162611961364746, "lr": 9.334739263803682e-06, "objective/entropy": -312.92230224609375, "objective/kl": 39.64112091064453, "objective/non_score_reward": -3.9641122817993164, "objective/rlhf_reward": -13.90903742142194, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 6.51772928237915, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6307119131088257, "step": 1041, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0056896209716797 }, { "episode": 16688, "epoch": 0.29996045583635905, "loss/policy_avg": 0.02918429672718048, "lr": 9.334100204498977e-06, "objective/entropy": -308.1679992675781, "objective/kl": 38.570892333984375, "objective/non_score_reward": -3.8570895195007324, "objective/rlhf_reward": -14.102844748526735, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 3.2447240352630615, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5860459804534912, "step": 1042, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0004019737243652 }, { "episode": 16704, "epoch": 0.3002480497537477, "loss/policy_avg": 1.9080119132995605, "lr": 9.333461145194274e-06, "objective/entropy": -126.83370971679688, "objective/kl": 45.10395812988281, "objective/non_score_reward": -4.5103960037231445, "objective/rlhf_reward": -16.716070685416383, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 14.95945930480957, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.8886516094207764, "step": 1043, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9978519678115845 }, { "episode": 16720, "epoch": 0.30053564367113633, "loss/policy_avg": 0.9995555877685547, "lr": 9.332822085889571e-06, "objective/entropy": -386.8150329589844, "objective/kl": 23.879730224609375, "objective/non_score_reward": -2.387972831726074, "objective/rlhf_reward": -9.551890969276428, "objective/scores": 0.0, "policy/approxkl_avg": 38.0056037902832, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6934055089950562, "step": 1044, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.0008788108825684 }, { "episode": 16736, "epoch": 0.300823237588525, "loss/policy_avg": 0.14363548159599304, "lr": 9.332183026584868e-06, "objective/entropy": -129.5137939453125, "objective/kl": 62.553104400634766, "objective/non_score_reward": -6.255310535430908, "objective/rlhf_reward": -23.505469882281957, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 94.92597198486328, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6987278461456299, "step": 1045, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9974005222320557 }, { "episode": 16752, "epoch": 0.30111083150591367, "loss/policy_avg": 0.5547482967376709, "lr": 9.331543967280165e-06, "objective/entropy": -343.81732177734375, "objective/kl": 39.68150329589844, "objective/non_score_reward": -3.9681506156921387, "objective/rlhf_reward": -13.472601985931398, "objective/scores": 0.6, "policy/approxkl_avg": 5.066805362701416, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6512348055839539, "step": 1046, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9995243549346924 }, { "episode": 16768, "epoch": 0.3013984254233023, "loss/policy_avg": 0.6469882726669312, "lr": 9.330904907975462e-06, "objective/entropy": -305.9600830078125, "objective/kl": 47.63588333129883, "objective/non_score_reward": -4.763588905334473, "objective/rlhf_reward": -17.49809536239202, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 3.822155475616455, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8326170444488525, "step": 1047, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.000539779663086 }, { "episode": 16784, "epoch": 0.30168601934069095, "loss/policy_avg": 1.2081701755523682, "lr": 9.330265848670757e-06, "objective/entropy": -233.3433380126953, "objective/kl": 35.255149841308594, "objective/non_score_reward": -3.525515079498291, "objective/rlhf_reward": -12.368726627031961, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 18.206623077392578, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7223737835884094, "step": 1048, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9986642599105835 }, { "episode": 16800, "epoch": 0.3019736132580796, "loss/policy_avg": 0.10891236364841461, "lr": 9.329626789366054e-06, "objective/entropy": -299.0600280761719, "objective/kl": 31.47181510925293, "objective/non_score_reward": -3.147181510925293, "objective/rlhf_reward": -11.138127665133819, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 2.4740471839904785, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.8737313747406006, "step": 1049, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.0015664100646973 }, { "episode": 16816, "epoch": 0.3022612071754682, "loss/policy_avg": 0.3800656497478485, "lr": 9.32898773006135e-06, "objective/entropy": -315.836181640625, "objective/kl": 30.152355194091797, "objective/non_score_reward": -3.015235424041748, "objective/rlhf_reward": -10.735428843527956, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 0.3732711970806122, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.7191339731216431, "step": 1050, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.000505208969116 }, { "episode": 16832, "epoch": 0.30254880109285687, "loss/policy_avg": 0.1446080356836319, "lr": 9.328348670756648e-06, "objective/entropy": -344.92327880859375, "objective/kl": 42.85297775268555, "objective/non_score_reward": -4.285297870635986, "objective/rlhf_reward": -15.717359383304682, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 52.67206573486328, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6258249282836914, "step": 1051, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.997635841369629 }, { "episode": 16848, "epoch": 0.3028363950102455, "loss/policy_avg": 0.9433111548423767, "lr": 9.327709611451944e-06, "objective/entropy": -215.4765625, "objective/kl": 51.37515640258789, "objective/non_score_reward": -5.137515544891357, "objective/rlhf_reward": -16.15006265640259, "objective/scores": 1.1, "policy/approxkl_avg": 8.241392135620117, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6037826538085938, "step": 1052, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9992796182632446 }, { "episode": 16864, "epoch": 0.3031239889276342, "loss/policy_avg": 0.46238401532173157, "lr": 9.32707055214724e-06, "objective/entropy": -88.19042205810547, "objective/kl": 53.18672180175781, "objective/non_score_reward": -5.3186726570129395, "objective/rlhf_reward": -19.541356817881265, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 17.840152740478516, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.47705012559890747, "step": 1053, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9991493225097656 }, { "episode": 16880, "epoch": 0.30341158284502284, "loss/policy_avg": 0.2535289525985718, "lr": 9.326431492842537e-06, "objective/entropy": -260.73590087890625, "objective/kl": 47.46588134765625, "objective/non_score_reward": -4.746587753295898, "objective/rlhf_reward": -17.660838398963136, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 26.296751022338867, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6439003944396973, "step": 1054, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9989542961120605 }, { "episode": 16896, "epoch": 0.3036991767624115, "loss/policy_avg": -0.44362181425094604, "lr": 9.325792433537833e-06, "objective/entropy": -415.29071044921875, "objective/kl": 37.31279754638672, "objective/non_score_reward": -3.7312798500061035, "objective/rlhf_reward": -14.92512035369873, "objective/scores": 0.0, "policy/approxkl_avg": 6.905259132385254, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5757873058319092, "step": 1055, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0002002716064453 }, { "episode": 16912, "epoch": 0.3039867706798001, "loss/policy_avg": 0.9405224323272705, "lr": 9.325153374233129e-06, "objective/entropy": -295.5430908203125, "objective/kl": 41.93619918823242, "objective/non_score_reward": -4.193620204925537, "objective/rlhf_reward": -12.37448081970215, "objective/scores": 1.1, "policy/approxkl_avg": 18.36953353881836, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6437206268310547, "step": 1056, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9979463815689087 }, { "episode": 16928, "epoch": 0.30427436459718876, "loss/policy_avg": 2.932800769805908, "lr": 9.324514314928425e-06, "objective/entropy": -274.6573791503906, "objective/kl": 47.688812255859375, "objective/non_score_reward": -4.768881320953369, "objective/rlhf_reward": -19.075525045394897, "objective/scores": 0.0, "policy/approxkl_avg": 9.993431091308594, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.6071447134017944, "step": 1057, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9977655410766602 }, { "episode": 16944, "epoch": 0.3045619585145774, "loss/policy_avg": 1.1020692586898804, "lr": 9.323875255623722e-06, "objective/entropy": -164.4237060546875, "objective/kl": 31.39014434814453, "objective/non_score_reward": -3.139014720916748, "objective/rlhf_reward": -11.075105789120554, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 5.603264331817627, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.3782082796096802, "step": 1058, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.998844861984253 }, { "episode": 16960, "epoch": 0.30484955243196604, "loss/policy_avg": 0.8170310258865356, "lr": 9.32323619631902e-06, "objective/entropy": -345.8731994628906, "objective/kl": 43.98723602294922, "objective/non_score_reward": -4.39872407913208, "objective/rlhf_reward": -16.235646450255793, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 3.8753180503845215, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6707043051719666, "step": 1059, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9989954233169556 }, { "episode": 16976, "epoch": 0.30513714634935474, "loss/policy_avg": 0.26879221200942993, "lr": 9.322597137014316e-06, "objective/entropy": -228.78077697753906, "objective/kl": 39.21705627441406, "objective/non_score_reward": -3.921705961227417, "objective/rlhf_reward": -14.36131123068921, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 8.223040580749512, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.5167461633682251, "step": 1060, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9998619556427002 }, { "episode": 16992, "epoch": 0.3054247402667434, "loss/policy_avg": 0.7971950769424438, "lr": 9.321958077709611e-06, "objective/entropy": -379.18731689453125, "objective/kl": 44.505802154541016, "objective/non_score_reward": -4.450580596923828, "objective/rlhf_reward": -16.42371926554809, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 76.0206298828125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6505694389343262, "step": 1061, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9987213611602783 }, { "episode": 17008, "epoch": 0.305712334184132, "loss/policy_avg": 1.5369600057601929, "lr": 9.321319018404908e-06, "objective/entropy": -303.25933837890625, "objective/kl": 54.402259826660156, "objective/non_score_reward": -5.440226078033447, "objective/rlhf_reward": -20.099044804990875, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 14.717708587646484, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7957777976989746, "step": 1062, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 2.0009021759033203 }, { "episode": 17024, "epoch": 0.30599992810152066, "loss/policy_avg": 0.9507750272750854, "lr": 9.320679959100205e-06, "objective/entropy": 16.85193634033203, "objective/kl": 55.463462829589844, "objective/non_score_reward": -5.546346664428711, "objective/rlhf_reward": -20.581265959803183, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 5.1870317459106445, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5107527375221252, "step": 1063, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9998078346252441 }, { "episode": 17040, "epoch": 0.3062875220189093, "loss/policy_avg": 2.855557918548584, "lr": 9.320040899795502e-06, "objective/entropy": -179.23472595214844, "objective/kl": 39.21595764160156, "objective/non_score_reward": -3.921596050262451, "objective/rlhf_reward": -13.286383962631227, "objective/scores": 0.6, "policy/approxkl_avg": 6.619713306427002, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6207290887832642, "step": 1064, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9986143112182617 }, { "episode": 17056, "epoch": 0.30657511593629794, "loss/policy_avg": 0.026278316974639893, "lr": 9.319401840490799e-06, "objective/entropy": -306.245361328125, "objective/kl": 26.791324615478516, "objective/non_score_reward": -2.6791324615478516, "objective/rlhf_reward": -9.391017708808107, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 51.70866012573242, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7264795303344727, "step": 1065, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9986615180969238 }, { "episode": 17072, "epoch": 0.3068627098536866, "loss/policy_avg": -0.15116065740585327, "lr": 9.318762781186094e-06, "objective/entropy": 62.529136657714844, "objective/kl": 22.53085708618164, "objective/non_score_reward": -2.2530856132507324, "objective/rlhf_reward": -9.01234233379364, "objective/scores": 0.0, "policy/approxkl_avg": 21.801652908325195, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.7525181770324707, "step": 1066, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9999632835388184 }, { "episode": 17088, "epoch": 0.3071503037710752, "loss/policy_avg": 0.28861698508262634, "lr": 9.318123721881391e-06, "objective/entropy": -352.76885986328125, "objective/kl": 35.70200729370117, "objective/non_score_reward": -3.5702006816864014, "objective/rlhf_reward": -12.856970627506342, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 8.341144561767578, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6388325095176697, "step": 1067, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 1.9993162155151367 }, { "episode": 17104, "epoch": 0.3074378976884639, "loss/policy_avg": -0.20543266832828522, "lr": 9.317484662576688e-06, "objective/entropy": -143.38095092773438, "objective/kl": 38.71528625488281, "objective/non_score_reward": -3.871528387069702, "objective/rlhf_reward": -14.062281687458125, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 25.02120590209961, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.46889883279800415, "step": 1068, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9987019300460815 }, { "episode": 17120, "epoch": 0.30772549160585255, "loss/policy_avg": 0.19973969459533691, "lr": 9.316845603271985e-06, "objective/entropy": -411.72735595703125, "objective/kl": 44.07129669189453, "objective/non_score_reward": -4.407129764556885, "objective/rlhf_reward": -16.22851905822754, "objective/scores": 0.35, "policy/approxkl_avg": 1.6542760133743286, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7966042757034302, "step": 1069, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 2.0013904571533203 }, { "episode": 17136, "epoch": 0.3080130855232412, "loss/policy_avg": 1.4014922380447388, "lr": 9.316206543967282e-06, "objective/entropy": -341.11773681640625, "objective/kl": 43.00127410888672, "objective/non_score_reward": -4.300127983093262, "objective/rlhf_reward": -15.538651233137237, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 27.992881774902344, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6621806025505066, "step": 1070, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.999094009399414 }, { "episode": 17152, "epoch": 0.30830067944062983, "loss/policy_avg": 0.5178280472755432, "lr": 9.315567484662578e-06, "objective/entropy": -412.7268371582031, "objective/kl": 26.739971160888672, "objective/non_score_reward": -2.673997163772583, "objective/rlhf_reward": -9.034129147947418, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 7.041400909423828, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.6152924299240112, "step": 1071, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 2.000479221343994 }, { "episode": 17168, "epoch": 0.3085882733580185, "loss/policy_avg": -0.2718881070613861, "lr": 9.314928425357874e-06, "objective/entropy": -375.44464111328125, "objective/kl": 47.6744384765625, "objective/non_score_reward": -4.767443656921387, "objective/rlhf_reward": -16.66977415084839, "objective/scores": 0.6, "policy/approxkl_avg": 19.56097412109375, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.7692055702209473, "step": 1072, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.001349449157715 }, { "episode": 17184, "epoch": 0.3088758672754071, "loss/policy_avg": 1.4666513204574585, "lr": 9.31428936605317e-06, "objective/entropy": -174.66970825195312, "objective/kl": 34.193389892578125, "objective/non_score_reward": -3.419339656829834, "objective/rlhf_reward": -12.298756101218562, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 15.399024963378906, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7103724479675293, "step": 1073, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9989737272262573 }, { "episode": 17200, "epoch": 0.30916346119279575, "loss/policy_avg": 0.12603211402893066, "lr": 9.313650306748467e-06, "objective/entropy": -337.2054443359375, "objective/kl": 33.33768081665039, "objective/non_score_reward": -3.333768367767334, "objective/rlhf_reward": -11.884475330920562, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 0.6525580883026123, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7645479440689087, "step": 1074, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.0000722408294678 }, { "episode": 17216, "epoch": 0.3094510551101844, "loss/policy_avg": 0.8019425868988037, "lr": 9.313011247443764e-06, "objective/entropy": -333.5313720703125, "objective/kl": 29.647350311279297, "objective/non_score_reward": -2.9647350311279297, "objective/rlhf_reward": -9.911529133992131, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 37.83848571777344, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.8082382678985596, "step": 1075, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9983893632888794 }, { "episode": 17232, "epoch": 0.3097386490275731, "loss/policy_avg": 0.010537967085838318, "lr": 9.312372188139061e-06, "objective/entropy": -282.5594482421875, "objective/kl": 36.237037658691406, "objective/non_score_reward": -3.623703718185425, "objective/rlhf_reward": -12.66998612430961, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 10.97419548034668, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5336211323738098, "step": 1076, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0005412101745605 }, { "episode": 17248, "epoch": 0.3100262429449617, "loss/policy_avg": -0.2652335464954376, "lr": 9.311733128834356e-06, "objective/entropy": 27.890277862548828, "objective/kl": 40.74696731567383, "objective/non_score_reward": -4.0746965408325195, "objective/rlhf_reward": -14.817834022457959, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 54.30160140991211, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.4673164486885071, "step": 1077, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0019400119781494 }, { "episode": 17264, "epoch": 0.31031383686235037, "loss/policy_avg": 0.18203112483024597, "lr": 9.311094069529653e-06, "objective/entropy": -2.028911590576172, "objective/kl": 37.40778732299805, "objective/non_score_reward": -3.740778684616089, "objective/rlhf_reward": -14.963114976882935, "objective/scores": 0.0, "policy/approxkl_avg": 18.37608528137207, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.2910824418067932, "step": 1078, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0010132789611816 }, { "episode": 17280, "epoch": 0.310601430779739, "loss/policy_avg": 0.5637036561965942, "lr": 9.310455010224948e-06, "objective/entropy": -219.67300415039062, "objective/kl": 49.5472412109375, "objective/non_score_reward": -4.954723834991455, "objective/rlhf_reward": -18.395063002307978, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 30.05399513244629, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6825199127197266, "step": 1079, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9987962245941162 }, { "episode": 17296, "epoch": 0.31088902469712765, "loss/policy_avg": 1.7865959405899048, "lr": 9.309815950920245e-06, "objective/entropy": -183.074951171875, "objective/kl": 43.60378646850586, "objective/non_score_reward": -4.360378742218018, "objective/rlhf_reward": -15.96056282799995, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 13.183867454528809, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.7107292413711548, "step": 1080, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9992387294769287 }, { "episode": 17312, "epoch": 0.3111766186145163, "loss/policy_avg": 0.5400304794311523, "lr": 9.309176891615542e-06, "objective/entropy": -372.7396240234375, "objective/kl": 33.908042907714844, "objective/non_score_reward": -3.3908042907714844, "objective/rlhf_reward": -11.440510215536627, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 11.037405014038086, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6166536211967468, "step": 1081, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.999457836151123 }, { "episode": 17328, "epoch": 0.3114642125319049, "loss/policy_avg": 0.24341818690299988, "lr": 9.308537832310839e-06, "objective/entropy": -413.0890808105469, "objective/kl": 23.821657180786133, "objective/non_score_reward": -2.3821659088134766, "objective/rlhf_reward": -7.866804366529571, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 1.0880775451660156, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6126406192779541, "step": 1082, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.000682830810547 }, { "episode": 17344, "epoch": 0.3117518064492936, "loss/policy_avg": 0.07967303693294525, "lr": 9.307898773006136e-06, "objective/entropy": -256.42462158203125, "objective/kl": 39.169315338134766, "objective/non_score_reward": -3.916931629180908, "objective/rlhf_reward": -14.063606295649128, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 21.123905181884766, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8925327062606812, "step": 1083, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.996124267578125 }, { "episode": 17360, "epoch": 0.31203940036668226, "loss/policy_avg": 0.23759812116622925, "lr": 9.307259713701433e-06, "objective/entropy": -383.95867919921875, "objective/kl": 35.23279571533203, "objective/non_score_reward": -3.5232796669006348, "objective/rlhf_reward": -12.431259637296783, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 23.20836067199707, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7214142680168152, "step": 1084, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 14, "val/ratio": 1.9979887008666992 }, { "episode": 17376, "epoch": 0.3123269942840709, "loss/policy_avg": 0.3815416395664215, "lr": 9.306620654396728e-06, "objective/entropy": -322.26068115234375, "objective/kl": 39.75225067138672, "objective/non_score_reward": -3.9752249717712402, "objective/rlhf_reward": -14.34464010497625, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 4.111967086791992, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6916395425796509, "step": 1085, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.000641107559204 }, { "episode": 17392, "epoch": 0.31261458820145954, "loss/policy_avg": 0.2480873018503189, "lr": 9.305981595092025e-06, "objective/entropy": -294.0469970703125, "objective/kl": 44.66571807861328, "objective/non_score_reward": -4.466571807861328, "objective/rlhf_reward": -16.262168679300864, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 1.297031283378601, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5874477028846741, "step": 1086, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.999356985092163 }, { "episode": 17408, "epoch": 0.3129021821188482, "loss/policy_avg": 0.036078356206417084, "lr": 9.305342535787322e-06, "objective/entropy": -245.5867919921875, "objective/kl": 43.741416931152344, "objective/non_score_reward": -4.374141693115234, "objective/rlhf_reward": -16.17105415824048, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 22.203502655029297, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5387413501739502, "step": 1087, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9979183673858643 }, { "episode": 17424, "epoch": 0.3131897760362368, "loss/policy_avg": 0.19466792047023773, "lr": 9.304703476482619e-06, "objective/entropy": -396.5644226074219, "objective/kl": 34.08475875854492, "objective/non_score_reward": -3.408475875854492, "objective/rlhf_reward": -11.686492632107672, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 24.210182189941406, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7581065893173218, "step": 1088, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9994767904281616 }, { "episode": 17440, "epoch": 0.31347736995362546, "loss/policy_avg": 0.4639536738395691, "lr": 9.304064417177915e-06, "objective/entropy": -334.7198181152344, "objective/kl": 41.038028717041016, "objective/non_score_reward": -4.1038031578063965, "objective/rlhf_reward": -14.015212631225587, "objective/scores": 0.6, "policy/approxkl_avg": 2.509368896484375, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.9146820306777954, "step": 1089, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 2.0003623962402344 }, { "episode": 17456, "epoch": 0.3137649638710141, "loss/policy_avg": 1.8447387218475342, "lr": 9.30342535787321e-06, "objective/entropy": -167.6761474609375, "objective/kl": 44.13557434082031, "objective/non_score_reward": -4.413558006286621, "objective/rlhf_reward": -15.82940172699363, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 22.643352508544922, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5217970609664917, "step": 1090, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9973323345184326 }, { "episode": 17472, "epoch": 0.3140525577884028, "loss/policy_avg": -0.23901599645614624, "lr": 9.302786298568508e-06, "objective/entropy": -261.5755310058594, "objective/kl": 48.49864196777344, "objective/non_score_reward": -4.8498640060424805, "objective/rlhf_reward": -17.94885740718399, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 5.599089622497559, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6075021624565125, "step": 1091, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0020885467529297 }, { "episode": 17488, "epoch": 0.31434015170579144, "loss/policy_avg": 0.26786884665489197, "lr": 9.302147239263804e-06, "objective/entropy": -351.76434326171875, "objective/kl": 47.7818489074707, "objective/non_score_reward": -4.77818489074707, "objective/rlhf_reward": -16.189021502376768, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 9.582246780395508, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7347859740257263, "step": 1092, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9980502128601074 }, { "episode": 17504, "epoch": 0.3146277456231801, "loss/policy_avg": -0.12303707003593445, "lr": 9.301508179959101e-06, "objective/entropy": -427.9180908203125, "objective/kl": 32.77983856201172, "objective/non_score_reward": -3.2779839038848877, "objective/rlhf_reward": -11.770300438910155, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 0.8784555196762085, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6480517983436584, "step": 1093, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0015769004821777 }, { "episode": 17520, "epoch": 0.3149153395405687, "loss/policy_avg": 0.4594835042953491, "lr": 9.300869120654398e-06, "objective/entropy": -325.3517761230469, "objective/kl": 39.891822814941406, "objective/non_score_reward": -3.989182233810425, "objective/rlhf_reward": -14.615093281775145, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 113.800537109375, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6559237837791443, "step": 1094, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9982614517211914 }, { "episode": 17536, "epoch": 0.31520293345795736, "loss/policy_avg": 0.6721220016479492, "lr": 9.300230061349695e-06, "objective/entropy": -308.3196105957031, "objective/kl": 35.372657775878906, "objective/non_score_reward": -3.5372657775878906, "objective/rlhf_reward": -12.54494288927706, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 3.719980001449585, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8252613544464111, "step": 1095, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9973175525665283 }, { "episode": 17552, "epoch": 0.315490527375346, "loss/policy_avg": 0.3110813498497009, "lr": 9.29959100204499e-06, "objective/entropy": -396.62567138671875, "objective/kl": 24.451473236083984, "objective/non_score_reward": -2.4451475143432617, "objective/rlhf_reward": -5.380589818954467, "objective/scores": 1.1, "policy/approxkl_avg": 1.7703379392623901, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6306072473526001, "step": 1096, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 2.0027711391448975 }, { "episode": 17568, "epoch": 0.31577812129273464, "loss/policy_avg": 0.17199182510375977, "lr": 9.298951942740287e-06, "objective/entropy": -54.00721740722656, "objective/kl": 48.57429504394531, "objective/non_score_reward": -4.8574299812316895, "objective/rlhf_reward": -17.94876778405464, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 3.585174560546875, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.42359066009521484, "step": 1097, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.999900460243225 }, { "episode": 17584, "epoch": 0.31606571521012333, "loss/policy_avg": -0.11684739589691162, "lr": 9.298312883435584e-06, "objective/entropy": -307.7109375, "objective/kl": 42.85301208496094, "objective/non_score_reward": -4.285301208496094, "objective/rlhf_reward": -15.018498124853643, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 52.40364074707031, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6246902942657471, "step": 1098, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 2.0012102127075195 }, { "episode": 17600, "epoch": 0.316353309127512, "loss/policy_avg": 8.779764175415039e-05, "lr": 9.29767382413088e-06, "objective/entropy": -191.7771453857422, "objective/kl": 37.2208251953125, "objective/non_score_reward": -3.7220826148986816, "objective/rlhf_reward": -13.154996887842813, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 2.728720188140869, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8339232206344604, "step": 1099, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 2.0024797916412354 }, { "episode": 17616, "epoch": 0.3166409030449006, "loss/policy_avg": 0.9306607842445374, "lr": 9.297034764826178e-06, "objective/entropy": -190.11915588378906, "objective/kl": 42.54877853393555, "objective/non_score_reward": -4.25487756729126, "objective/rlhf_reward": -15.463250963893486, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 19.97567367553711, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5922030210494995, "step": 1100, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9969046115875244 }, { "episode": 17632, "epoch": 0.31692849696228925, "loss/policy_avg": 0.4076872766017914, "lr": 9.296395705521473e-06, "objective/entropy": -323.73876953125, "objective/kl": 35.8895263671875, "objective/non_score_reward": -3.5889527797698975, "objective/rlhf_reward": -12.408399890141423, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 26.755075454711914, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5746772289276123, "step": 1101, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9977562427520752 }, { "episode": 17648, "epoch": 0.3172160908796779, "loss/policy_avg": 1.4372249841690063, "lr": 9.29575664621677e-06, "objective/entropy": -320.81988525390625, "objective/kl": 41.59331512451172, "objective/non_score_reward": -4.159331321716309, "objective/rlhf_reward": -12.237325525283815, "objective/scores": 1.1, "policy/approxkl_avg": 16.800437927246094, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5043920278549194, "step": 1102, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.99782133102417 }, { "episode": 17664, "epoch": 0.31750368479706653, "loss/policy_avg": 0.40065181255340576, "lr": 9.295117586912065e-06, "objective/entropy": -336.7508850097656, "objective/kl": 31.292213439941406, "objective/non_score_reward": -3.129220962524414, "objective/rlhf_reward": -11.06628618678604, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 37.63865661621094, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6837561130523682, "step": 1103, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.9986858367919922 }, { "episode": 17680, "epoch": 0.31779127871445517, "loss/policy_avg": 0.09660424292087555, "lr": 9.294478527607362e-06, "objective/entropy": -153.93927001953125, "objective/kl": 38.50758361816406, "objective/non_score_reward": -3.8507578372955322, "objective/rlhf_reward": -14.04378172132818, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 11.370977401733398, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5768933296203613, "step": 1104, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0033788681030273 }, { "episode": 17696, "epoch": 0.3180788726318438, "loss/policy_avg": 0.8987742066383362, "lr": 9.293839468302659e-06, "objective/entropy": -238.22705078125, "objective/kl": 37.34947967529297, "objective/non_score_reward": -3.734947919845581, "objective/rlhf_reward": -13.489192823977813, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 17.397953033447266, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6979916095733643, "step": 1105, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.996543049812317 }, { "episode": 17712, "epoch": 0.3183664665492325, "loss/policy_avg": 0.6566457748413086, "lr": 9.293200408997956e-06, "objective/entropy": -374.3487854003906, "objective/kl": 32.60210418701172, "objective/non_score_reward": -3.2602105140686035, "objective/rlhf_reward": -10.640841817855836, "objective/scores": 0.6, "policy/approxkl_avg": 4.950079917907715, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5696313977241516, "step": 1106, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9996005296707153 }, { "episode": 17728, "epoch": 0.31865406046662115, "loss/policy_avg": 0.24018500745296478, "lr": 9.292561349693252e-06, "objective/entropy": -133.2898712158203, "objective/kl": 45.015235900878906, "objective/non_score_reward": -4.501523971557617, "objective/rlhf_reward": -13.60609588623047, "objective/scores": 1.1, "policy/approxkl_avg": 3.4814629554748535, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5956703424453735, "step": 1107, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.000743865966797 }, { "episode": 17744, "epoch": 0.3189416543840098, "loss/policy_avg": 0.8935117721557617, "lr": 9.29192229038855e-06, "objective/entropy": -96.28314208984375, "objective/kl": 46.588714599609375, "objective/non_score_reward": -4.658871650695801, "objective/rlhf_reward": -17.30997279646985, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 1.1111669540405273, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5983787775039673, "step": 1108, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9997496604919434 }, { "episode": 17760, "epoch": 0.3192292483013984, "loss/policy_avg": 0.13787852227687836, "lr": 9.291283231083845e-06, "objective/entropy": -387.2899169921875, "objective/kl": 30.800121307373047, "objective/non_score_reward": -3.080012083053589, "objective/rlhf_reward": -10.896215994556513, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 3.0769567489624023, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7568919658660889, "step": 1109, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.998976230621338 }, { "episode": 17776, "epoch": 0.31951684221878707, "loss/policy_avg": -0.8365802764892578, "lr": 9.290644171779141e-06, "objective/entropy": -215.361572265625, "objective/kl": 43.039161682128906, "objective/non_score_reward": -4.303915977478027, "objective/rlhf_reward": -15.856414758895319, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 29.877323150634766, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.6622889041900635, "step": 1110, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 2.002612590789795 }, { "episode": 17792, "epoch": 0.3198044361361757, "loss/policy_avg": -0.14821678400039673, "lr": 9.290005112474438e-06, "objective/entropy": -369.02874755859375, "objective/kl": 36.46915817260742, "objective/non_score_reward": -3.646916151046753, "objective/rlhf_reward": -13.262151513129396, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 2.8801069259643555, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.5768126249313354, "step": 1111, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.001115322113037 }, { "episode": 17808, "epoch": 0.32009203005356435, "loss/policy_avg": 1.0569783449172974, "lr": 9.289366053169735e-06, "objective/entropy": -184.51934814453125, "objective/kl": 42.531089782714844, "objective/non_score_reward": -4.253108978271484, "objective/rlhf_reward": -15.06502539940351, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 10.124497413635254, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.39696118235588074, "step": 1112, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0000112056732178 }, { "episode": 17824, "epoch": 0.320379623970953, "loss/policy_avg": 0.09561894834041595, "lr": 9.288726993865032e-06, "objective/entropy": -288.62420654296875, "objective/kl": 36.91450500488281, "objective/non_score_reward": -3.691450595855713, "objective/rlhf_reward": -13.440290246039552, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 14.518285751342773, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5317392945289612, "step": 1113, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.000993013381958 }, { "episode": 17840, "epoch": 0.3206672178883417, "loss/policy_avg": 1.6176202297210693, "lr": 9.288087934560327e-06, "objective/entropy": -364.4542236328125, "objective/kl": 32.085243225097656, "objective/non_score_reward": -3.208524465560913, "objective/rlhf_reward": -10.7113913915315, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 23.10379409790039, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.727960467338562, "step": 1114, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 1.9965965747833252 }, { "episode": 17856, "epoch": 0.3209548118057303, "loss/policy_avg": 0.9114197492599487, "lr": 9.287448875255624e-06, "objective/entropy": -347.46783447265625, "objective/kl": 36.07443618774414, "objective/non_score_reward": -3.6074438095092773, "objective/rlhf_reward": -10.029774999618532, "objective/scores": 1.1, "policy/approxkl_avg": 31.37342071533203, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7413814067840576, "step": 1115, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9980356693267822 }, { "episode": 17872, "epoch": 0.32124240572311896, "loss/policy_avg": 0.06097881495952606, "lr": 9.286809815950921e-06, "objective/entropy": -265.2679443359375, "objective/kl": 39.52857971191406, "objective/non_score_reward": -3.952857732772827, "objective/rlhf_reward": -14.485918078452272, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 24.505386352539062, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6894311904907227, "step": 1116, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9991822242736816 }, { "episode": 17888, "epoch": 0.3215299996405076, "loss/policy_avg": 0.1772969365119934, "lr": 9.286170756646218e-06, "objective/entropy": -368.5130615234375, "objective/kl": 25.519805908203125, "objective/non_score_reward": -2.55198073387146, "objective/rlhf_reward": -8.80792293548584, "objective/scores": 0.35, "policy/approxkl_avg": 6.3811235427856445, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.557669997215271, "step": 1117, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.0001704692840576 }, { "episode": 17904, "epoch": 0.32181759355789624, "loss/policy_avg": 0.11265039443969727, "lr": 9.285531697341515e-06, "objective/entropy": -335.3668518066406, "objective/kl": 34.044273376464844, "objective/non_score_reward": -3.4044275283813477, "objective/rlhf_reward": -12.101938092502293, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 6.047215461730957, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8071970343589783, "step": 1118, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9983731508255005 }, { "episode": 17920, "epoch": 0.3221051874752849, "loss/policy_avg": 0.2099117487668991, "lr": 9.284892638036812e-06, "objective/entropy": -92.14942932128906, "objective/kl": 33.853721618652344, "objective/non_score_reward": -3.3853721618652344, "objective/rlhf_reward": -12.199853232412963, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 2.183546543121338, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.8027639389038086, "step": 1119, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 2.000032901763916 }, { "episode": 17936, "epoch": 0.3223927813926735, "loss/policy_avg": 1.2880432605743408, "lr": 9.284253578732107e-06, "objective/entropy": -360.373779296875, "objective/kl": 30.932432174682617, "objective/non_score_reward": -3.093243360519409, "objective/rlhf_reward": -9.972973203659059, "objective/scores": 0.6, "policy/approxkl_avg": 8.325624465942383, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.48624560236930847, "step": 1120, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.998833417892456 }, { "episode": 17952, "epoch": 0.3226803753100622, "loss/policy_avg": 0.24510294198989868, "lr": 9.283614519427404e-06, "objective/entropy": -433.644775390625, "objective/kl": 42.8654670715332, "objective/non_score_reward": -4.28654670715332, "objective/rlhf_reward": -15.542067084375937, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 5.609282493591309, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.586551308631897, "step": 1121, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9990415573120117 }, { "episode": 17968, "epoch": 0.32296796922745086, "loss/policy_avg": 0.09778931736946106, "lr": 9.2829754601227e-06, "objective/entropy": -383.89117431640625, "objective/kl": 21.19662094116211, "objective/non_score_reward": -2.119662284851074, "objective/rlhf_reward": -6.7453155676523835, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 6.699183464050293, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6772907972335815, "step": 1122, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.9988560676574707 }, { "episode": 17984, "epoch": 0.3232555631448395, "loss/policy_avg": 0.24572011828422546, "lr": 9.282336400817996e-06, "objective/entropy": -327.8108825683594, "objective/kl": 34.61373519897461, "objective/non_score_reward": -3.461373805999756, "objective/rlhf_reward": -12.020665998729775, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 22.840553283691406, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.695665717124939, "step": 1123, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.000403881072998 }, { "episode": 18000, "epoch": 0.32354315706222814, "loss/policy_avg": -1.0019803047180176, "lr": 9.281697341513293e-06, "objective/entropy": -307.68548583984375, "objective/kl": 38.93940734863281, "objective/non_score_reward": -3.8939409255981445, "objective/rlhf_reward": -14.019504635539604, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 13.207256317138672, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7523318529129028, "step": 1124, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.0070571899414062 }, { "episode": 18016, "epoch": 0.3238307509796168, "loss/policy_avg": 0.8281399011611938, "lr": 9.28105828220859e-06, "objective/entropy": -259.5808410644531, "objective/kl": 35.205833435058594, "objective/non_score_reward": -3.520583152770996, "objective/rlhf_reward": -12.134921858982977, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 18.91745948791504, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.4909522831439972, "step": 1125, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9986555576324463 }, { "episode": 18032, "epoch": 0.3241183448970054, "loss/policy_avg": -0.09073008596897125, "lr": 9.280419222903886e-06, "objective/entropy": -308.7064208984375, "objective/kl": 31.744949340820312, "objective/non_score_reward": -3.174494743347168, "objective/rlhf_reward": -10.750568459706242, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 2.127901554107666, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6888346672058105, "step": 1126, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9997719526290894 }, { "episode": 18048, "epoch": 0.32440593881439406, "loss/policy_avg": 0.4604296088218689, "lr": 9.279780163599183e-06, "objective/entropy": -410.888916015625, "objective/kl": 30.064613342285156, "objective/non_score_reward": -3.0064616203308105, "objective/rlhf_reward": -10.57524786433731, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 2.9717319011688232, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6274343729019165, "step": 1127, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.000220537185669 }, { "episode": 18064, "epoch": 0.3246935327317827, "loss/policy_avg": 1.558852195739746, "lr": 9.279141104294478e-06, "objective/entropy": -265.7242431640625, "objective/kl": 37.62788772583008, "objective/non_score_reward": -3.762789011001587, "objective/rlhf_reward": -12.127437029720518, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 11.527868270874023, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.706502377986908, "step": 1128, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9975552558898926 }, { "episode": 18080, "epoch": 0.3249811266491714, "loss/policy_avg": 0.8726902604103088, "lr": 9.278502044989775e-06, "objective/entropy": -296.6790466308594, "objective/kl": 40.65041732788086, "objective/non_score_reward": -4.065041542053223, "objective/rlhf_reward": -13.336447869182798, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 5.849645137786865, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6638140678405762, "step": 1129, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9985852241516113 }, { "episode": 18096, "epoch": 0.32526872056656003, "loss/policy_avg": 0.04483785480260849, "lr": 9.277862985685072e-06, "objective/entropy": -370.07318115234375, "objective/kl": 37.146427154541016, "objective/non_score_reward": -3.7146425247192383, "objective/rlhf_reward": -13.125237480799356, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 11.139961242675781, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6867357492446899, "step": 1130, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9987016916275024 }, { "episode": 18112, "epoch": 0.32555631448394867, "loss/policy_avg": 2.475396156311035, "lr": 9.277223926380369e-06, "objective/entropy": -201.3065948486328, "objective/kl": 56.50867462158203, "objective/non_score_reward": -5.650867938995361, "objective/rlhf_reward": -21.179638941486445, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 12.423762321472168, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5622425079345703, "step": 1131, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9981927871704102 }, { "episode": 18128, "epoch": 0.3258439084013373, "loss/policy_avg": 0.10702167451381683, "lr": 9.276584867075666e-06, "objective/entropy": -293.309326171875, "objective/kl": 44.60116958618164, "objective/non_score_reward": -4.460117340087891, "objective/rlhf_reward": -16.514954719573183, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 9.137533187866211, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7552295923233032, "step": 1132, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9991569519042969 }, { "episode": 18144, "epoch": 0.32613150231872595, "loss/policy_avg": 0.5607671737670898, "lr": 9.275945807770961e-06, "objective/entropy": -136.83909606933594, "objective/kl": 45.55015563964844, "objective/non_score_reward": -4.555015563964844, "objective/rlhf_reward": -16.097357215658697, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 72.54873657226562, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5246527194976807, "step": 1133, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.999291181564331 }, { "episode": 18160, "epoch": 0.3264190962361146, "loss/policy_avg": 0.7766623497009277, "lr": 9.275306748466258e-06, "objective/entropy": -162.1307830810547, "objective/kl": 34.953895568847656, "objective/non_score_reward": -3.495389699935913, "objective/rlhf_reward": -12.31969905418216, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 5.403459072113037, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7220122218132019, "step": 1134, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9992647171020508 }, { "episode": 18176, "epoch": 0.32670669015350323, "loss/policy_avg": 0.4037429988384247, "lr": 9.274667689161555e-06, "objective/entropy": -211.73019409179688, "objective/kl": 37.05247116088867, "objective/non_score_reward": -3.705247402191162, "objective/rlhf_reward": -13.479353240042357, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 48.65488052368164, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.630548894405365, "step": 1135, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9991159439086914 }, { "episode": 18192, "epoch": 0.3269942840708919, "loss/policy_avg": 0.30346184968948364, "lr": 9.274028629856852e-06, "objective/entropy": -319.5671691894531, "objective/kl": 38.04278564453125, "objective/non_score_reward": -3.80427885055542, "objective/rlhf_reward": -13.483781353632608, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 1.8888806104660034, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6457566022872925, "step": 1136, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9993863105773926 }, { "episode": 18208, "epoch": 0.32728187798828057, "loss/policy_avg": 0.6692730188369751, "lr": 9.273389570552149e-06, "objective/entropy": -269.1883544921875, "objective/kl": 44.28985595703125, "objective/non_score_reward": -4.428985595703125, "objective/rlhf_reward": -16.11182168490084, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 4.870687961578369, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5717663764953613, "step": 1137, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.0022006034851074 }, { "episode": 18224, "epoch": 0.3275694719056692, "loss/policy_avg": 0.9008448123931885, "lr": 9.272750511247446e-06, "objective/entropy": -348.15283203125, "objective/kl": 43.51081085205078, "objective/non_score_reward": -4.351080894470215, "objective/rlhf_reward": -15.45691354103559, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 18.84325408935547, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7703189253807068, "step": 1138, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9977266788482666 }, { "episode": 18240, "epoch": 0.32785706582305785, "loss/policy_avg": 0.5011532306671143, "lr": 9.27211145194274e-06, "objective/entropy": -178.40187072753906, "objective/kl": 46.04515075683594, "objective/non_score_reward": -4.604515552520752, "objective/rlhf_reward": -14.01806221008301, "objective/scores": 1.1, "policy/approxkl_avg": 11.445690155029297, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6388037204742432, "step": 1139, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9991984367370605 }, { "episode": 18256, "epoch": 0.3281446597404465, "loss/policy_avg": -0.21231843531131744, "lr": 9.271472392638038e-06, "objective/entropy": -344.498779296875, "objective/kl": 45.12735366821289, "objective/non_score_reward": -4.512735843658447, "objective/rlhf_reward": -16.103531430439887, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 11.339037895202637, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.679973840713501, "step": 1140, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9992462396621704 }, { "episode": 18272, "epoch": 0.3284322536578351, "loss/policy_avg": 0.38732728362083435, "lr": 9.270833333333334e-06, "objective/entropy": -359.1474304199219, "objective/kl": 32.12001037597656, "objective/non_score_reward": -3.212001085281372, "objective/rlhf_reward": -8.448004341125488, "objective/scores": 1.1, "policy/approxkl_avg": 121.40550231933594, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6418951749801636, "step": 1141, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.999131441116333 }, { "episode": 18288, "epoch": 0.32871984757522377, "loss/policy_avg": 0.1380116492509842, "lr": 9.270194274028631e-06, "objective/entropy": -243.837158203125, "objective/kl": 52.0350341796875, "objective/non_score_reward": -5.203503131866455, "objective/rlhf_reward": -20.8140127658844, "objective/scores": 0.0, "policy/approxkl_avg": 13.8850736618042, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6632077693939209, "step": 1142, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9975697994232178 }, { "episode": 18304, "epoch": 0.3290074414926124, "loss/policy_avg": -0.4552576541900635, "lr": 9.269555214723928e-06, "objective/entropy": -351.7379150390625, "objective/kl": 31.839832305908203, "objective/non_score_reward": -3.183983325958252, "objective/rlhf_reward": -11.335933542251588, "objective/scores": 0.35, "policy/approxkl_avg": 7.740172386169434, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5034804344177246, "step": 1143, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.0076465606689453 }, { "episode": 18320, "epoch": 0.3292950354100011, "loss/policy_avg": 0.8995065093040466, "lr": 9.268916155419223e-06, "objective/entropy": -367.9533386230469, "objective/kl": 46.771636962890625, "objective/non_score_reward": -4.677163600921631, "objective/rlhf_reward": -17.330052950469355, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 15.365102767944336, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5665295124053955, "step": 1144, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9998270273208618 }, { "episode": 18336, "epoch": 0.32958262932738974, "loss/policy_avg": 0.18674592673778534, "lr": 9.26827709611452e-06, "objective/entropy": -151.9562530517578, "objective/kl": 42.42345428466797, "objective/non_score_reward": -4.242345333099365, "objective/rlhf_reward": -14.569380617141725, "objective/scores": 0.6, "policy/approxkl_avg": 2.077385902404785, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6581109762191772, "step": 1145, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9993164539337158 }, { "episode": 18352, "epoch": 0.3298702232447784, "loss/policy_avg": 0.2070343792438507, "lr": 9.267638036809816e-06, "objective/entropy": -196.66079711914062, "objective/kl": 42.46497344970703, "objective/non_score_reward": -4.24649715423584, "objective/rlhf_reward": -16.98598861694336, "objective/scores": 0.0, "policy/approxkl_avg": 21.45248794555664, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6110658645629883, "step": 1146, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9995715618133545 }, { "episode": 18368, "epoch": 0.330157817162167, "loss/policy_avg": 0.019901232793927193, "lr": 9.266998977505112e-06, "objective/entropy": -302.61962890625, "objective/kl": 43.04490661621094, "objective/non_score_reward": -4.30449104309082, "objective/rlhf_reward": -14.294244681240293, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 2.8798656463623047, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6306400299072266, "step": 1147, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9995254278182983 }, { "episode": 18384, "epoch": 0.33044541107955566, "loss/policy_avg": -1.071119785308838, "lr": 9.26635991820041e-06, "objective/entropy": -298.47332763671875, "objective/kl": 45.9221076965332, "objective/non_score_reward": -4.5922112464904785, "objective/rlhf_reward": -16.76472500330599, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 5.062835693359375, "policy/clipfrac_avg": 2.0, "policy/entropy_avg": 0.8481374382972717, "step": 1148, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 14, "val/ratio": 2.0014488697052 }, { "episode": 18400, "epoch": 0.3307330049969443, "loss/policy_avg": 0.1676083207130432, "lr": 9.265720858895706e-06, "objective/entropy": -316.958251953125, "objective/kl": 39.11283493041992, "objective/non_score_reward": -3.911283254623413, "objective/rlhf_reward": -14.245133018493654, "objective/scores": 0.35, "policy/approxkl_avg": 0.5404627323150635, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7265818119049072, "step": 1149, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.0007596015930176 }, { "episode": 18416, "epoch": 0.33102059891433294, "loss/policy_avg": 0.49536746740341187, "lr": 9.265081799591003e-06, "objective/entropy": -333.7089538574219, "objective/kl": 37.18421936035156, "objective/non_score_reward": -3.7184224128723145, "objective/rlhf_reward": -13.317429869380547, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 5.429676055908203, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8142633438110352, "step": 1150, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.9982209205627441 }, { "episode": 18432, "epoch": 0.3313081928317216, "loss/policy_avg": 0.6204410195350647, "lr": 9.2644427402863e-06, "objective/entropy": -327.67926025390625, "objective/kl": 41.54536437988281, "objective/non_score_reward": -4.154536247253418, "objective/rlhf_reward": -13.694426928402159, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 5.728667259216309, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.631436824798584, "step": 1151, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.997909426689148 }, { "episode": 18448, "epoch": 0.3315957867491103, "loss/policy_avg": 0.33785608410835266, "lr": 9.263803680981595e-06, "objective/entropy": 53.933189392089844, "objective/kl": 47.77733612060547, "objective/non_score_reward": -4.77773380279541, "objective/rlhf_reward": -17.68710263510522, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 11.956180572509766, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7835606336593628, "step": 1152, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9975292682647705 }, { "episode": 18464, "epoch": 0.3318833806664989, "loss/policy_avg": -0.2504664361476898, "lr": 9.263164621676892e-06, "objective/entropy": -282.7781066894531, "objective/kl": 45.4371452331543, "objective/non_score_reward": -4.54371452331543, "objective/rlhf_reward": -16.052151860968145, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 8.570236206054688, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5472825765609741, "step": 1153, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.003934860229492 }, { "episode": 18480, "epoch": 0.33217097458388756, "loss/policy_avg": 0.9584856033325195, "lr": 9.262525562372189e-06, "objective/entropy": -383.873046875, "objective/kl": 25.311260223388672, "objective/non_score_reward": -2.531126022338867, "objective/rlhf_reward": -10.12450397014618, "objective/scores": 0.0, "policy/approxkl_avg": 72.12400817871094, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7414149045944214, "step": 1154, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9982688426971436 }, { "episode": 18496, "epoch": 0.3324585685012762, "loss/policy_avg": -0.11891481280326843, "lr": 9.261886503067486e-06, "objective/entropy": -393.4271240234375, "objective/kl": 43.84419250488281, "objective/non_score_reward": -4.3844194412231445, "objective/rlhf_reward": -16.21216491225354, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 2.7395830154418945, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6612921953201294, "step": 1155, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 2.0018463134765625 }, { "episode": 18512, "epoch": 0.33274616241866484, "loss/policy_avg": 0.9828933477401733, "lr": 9.261247443762783e-06, "objective/entropy": -347.4083251953125, "objective/kl": 41.05442810058594, "objective/non_score_reward": -4.105443000793457, "objective/rlhf_reward": -14.596943969997476, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 31.693294525146484, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7189889550209045, "step": 1156, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9980990886688232 }, { "episode": 18528, "epoch": 0.3330337563360535, "loss/policy_avg": 0.4129069149494171, "lr": 9.260608384458078e-06, "objective/entropy": -378.163330078125, "objective/kl": 45.197265625, "objective/non_score_reward": -4.519725799560547, "objective/rlhf_reward": -16.678904628753664, "objective/scores": 0.35, "policy/approxkl_avg": 10.346860885620117, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6512380838394165, "step": 1157, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9974095821380615 }, { "episode": 18544, "epoch": 0.3333213502534421, "loss/policy_avg": -0.3025512099266052, "lr": 9.259969325153375e-06, "objective/entropy": -362.97698974609375, "objective/kl": 34.08514404296875, "objective/non_score_reward": -3.4085144996643066, "objective/rlhf_reward": -12.308545622855348, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 72.48345184326172, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6691029667854309, "step": 1158, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.000927448272705 }, { "episode": 18560, "epoch": 0.3336089441708308, "loss/policy_avg": 1.332209587097168, "lr": 9.259330265848672e-06, "objective/entropy": -309.8684997558594, "objective/kl": 39.14338302612305, "objective/non_score_reward": -3.9143381118774414, "objective/rlhf_reward": -11.257352924346923, "objective/scores": 1.1, "policy/approxkl_avg": 23.91999626159668, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.7550044059753418, "step": 1159, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.998896837234497 }, { "episode": 18576, "epoch": 0.33389653808821945, "loss/policy_avg": 0.5401480197906494, "lr": 9.258691206543968e-06, "objective/entropy": -62.96473693847656, "objective/kl": 51.24605941772461, "objective/non_score_reward": -5.124605655670166, "objective/rlhf_reward": -20.498422384262085, "objective/scores": 0.0, "policy/approxkl_avg": 13.64315128326416, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7559372186660767, "step": 1160, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.999338150024414 }, { "episode": 18592, "epoch": 0.3341841320056081, "loss/policy_avg": 3.0065348148345947, "lr": 9.258052147239265e-06, "objective/entropy": -231.89511108398438, "objective/kl": 43.059139251708984, "objective/non_score_reward": -4.305913925170898, "objective/rlhf_reward": -15.100949945227178, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 6.2150774002075195, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5491085052490234, "step": 1161, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9989204406738281 }, { "episode": 18608, "epoch": 0.33447172592299673, "loss/policy_avg": 1.126502513885498, "lr": 9.257413087934562e-06, "objective/entropy": -235.5516357421875, "objective/kl": 51.061973571777344, "objective/non_score_reward": -5.106197357177734, "objective/rlhf_reward": -19.065540277694147, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 49.79182052612305, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.7184311151504517, "step": 1162, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9983779191970825 }, { "episode": 18624, "epoch": 0.33475931984038537, "loss/policy_avg": 0.576952338218689, "lr": 9.256774028629857e-06, "objective/entropy": -351.01971435546875, "objective/kl": 47.94415283203125, "objective/non_score_reward": -4.794415473937988, "objective/rlhf_reward": -17.57354167467745, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 35.15521240234375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6766231656074524, "step": 1163, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9983925819396973 }, { "episode": 18640, "epoch": 0.335046913757774, "loss/policy_avg": 0.1902318000793457, "lr": 9.256134969325154e-06, "objective/entropy": -164.63778686523438, "objective/kl": 49.55584716796875, "objective/non_score_reward": -4.95558500289917, "objective/rlhf_reward": -17.874928544239935, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 4.018957614898682, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8147241473197937, "step": 1164, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.0000228881835938 }, { "episode": 18656, "epoch": 0.33533450767516265, "loss/policy_avg": -0.48217761516571045, "lr": 9.255495910020451e-06, "objective/entropy": -386.71514892578125, "objective/kl": 40.849822998046875, "objective/non_score_reward": -4.084981918334961, "objective/rlhf_reward": -11.939928388595582, "objective/scores": 1.1, "policy/approxkl_avg": 7.581989288330078, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6964834928512573, "step": 1165, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.0013461112976074 }, { "episode": 18672, "epoch": 0.3356221015925513, "loss/policy_avg": -0.511573314666748, "lr": 9.254856850715748e-06, "objective/entropy": -293.1246032714844, "objective/kl": 46.077083587646484, "objective/non_score_reward": -4.607707977294922, "objective/rlhf_reward": -16.915060364993746, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 2.567927837371826, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5270912647247314, "step": 1166, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 14, "val/ratio": 2.0011024475097656 }, { "episode": 18688, "epoch": 0.33590969550994, "loss/policy_avg": -0.2334514558315277, "lr": 9.254217791411043e-06, "objective/entropy": -258.69140625, "objective/kl": 32.220436096191406, "objective/non_score_reward": -3.222043991088867, "objective/rlhf_reward": -9.964456234813902, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 2.4485650062561035, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7193528413772583, "step": 1167, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.0031991004943848 }, { "episode": 18704, "epoch": 0.3361972894273286, "loss/policy_avg": 1.0226199626922607, "lr": 9.25357873210634e-06, "objective/entropy": -269.5145568847656, "objective/kl": 29.77992057800293, "objective/non_score_reward": -2.977992296218872, "objective/rlhf_reward": -8.98825005138037, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 7.199236869812012, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6432288289070129, "step": 1168, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9985134601593018 }, { "episode": 18720, "epoch": 0.33648488334471727, "loss/policy_avg": 0.7639418840408325, "lr": 9.252939672801637e-06, "objective/entropy": -289.8372802734375, "objective/kl": 32.69376754760742, "objective/non_score_reward": -3.269376754760742, "objective/rlhf_reward": -11.71825727197973, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 6.762063980102539, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7863032817840576, "step": 1169, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.000936985015869 }, { "episode": 18736, "epoch": 0.3367724772621059, "loss/policy_avg": 1.262454867362976, "lr": 9.252300613496932e-06, "objective/entropy": -402.82830810546875, "objective/kl": 40.695125579833984, "objective/non_score_reward": -4.069512367248535, "objective/rlhf_reward": -16.278050422668457, "objective/scores": 0.0, "policy/approxkl_avg": 4.003758907318115, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5551731586456299, "step": 1170, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9989738464355469 }, { "episode": 18752, "epoch": 0.33706007117949455, "loss/policy_avg": 0.45785659551620483, "lr": 9.251661554192229e-06, "objective/entropy": -164.36351013183594, "objective/kl": 53.677452087402344, "objective/non_score_reward": -5.367745399475098, "objective/rlhf_reward": -20.020383696170196, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 1.0048176050186157, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6020734906196594, "step": 1171, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0043349266052246 }, { "episode": 18768, "epoch": 0.3373476650968832, "loss/policy_avg": 0.05135174095630646, "lr": 9.251022494887526e-06, "objective/entropy": -320.96697998046875, "objective/kl": 43.27254867553711, "objective/non_score_reward": -4.3272552490234375, "objective/rlhf_reward": -15.5756867090861, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 2.85768985748291, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6983485817909241, "step": 1172, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.9998271465301514 }, { "episode": 18784, "epoch": 0.3376352590142718, "loss/policy_avg": 0.7015002965927124, "lr": 9.250383435582823e-06, "objective/entropy": -400.7436218261719, "objective/kl": 40.43014907836914, "objective/non_score_reward": -4.043015480041504, "objective/rlhf_reward": -14.846547637015505, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 12.2098970413208, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5949186086654663, "step": 1173, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9993301630020142 }, { "episode": 18800, "epoch": 0.3379228529316605, "loss/policy_avg": 0.15271344780921936, "lr": 9.24974437627812e-06, "objective/entropy": -206.08641052246094, "objective/kl": 41.03256607055664, "objective/non_score_reward": -4.103257179260254, "objective/rlhf_reward": -15.013027524948122, "objective/scores": 0.35, "policy/approxkl_avg": 11.347776412963867, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6801156997680664, "step": 1174, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.998974323272705 }, { "episode": 18816, "epoch": 0.33821044684904916, "loss/policy_avg": 1.3508622646331787, "lr": 9.249105316973417e-06, "objective/entropy": -301.5340881347656, "objective/kl": 58.91084671020508, "objective/non_score_reward": -5.891084671020508, "objective/rlhf_reward": -22.008080094066216, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 89.3551025390625, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6079524755477905, "step": 1175, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.997277021408081 }, { "episode": 18832, "epoch": 0.3384980407664378, "loss/policy_avg": 0.6425105333328247, "lr": 9.248466257668712e-06, "objective/entropy": -136.189453125, "objective/kl": 37.78475570678711, "objective/non_score_reward": -3.778475761413574, "objective/rlhf_reward": -15.113902568817139, "objective/scores": 0.0, "policy/approxkl_avg": 60.16349792480469, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5068131685256958, "step": 1176, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9960620403289795 }, { "episode": 18848, "epoch": 0.33878563468382644, "loss/policy_avg": 1.4316264390945435, "lr": 9.247827198364009e-06, "objective/entropy": -303.93792724609375, "objective/kl": 34.397884368896484, "objective/non_score_reward": -3.439788579940796, "objective/rlhf_reward": -10.835435305477354, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 8.966455459594727, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.7622511982917786, "step": 1177, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9990615844726562 }, { "episode": 18864, "epoch": 0.3390732286012151, "loss/policy_avg": 0.4903183579444885, "lr": 9.247188139059305e-06, "objective/entropy": -358.79815673828125, "objective/kl": 41.87967300415039, "objective/non_score_reward": -4.187967300415039, "objective/rlhf_reward": -12.35186860561371, "objective/scores": 1.1, "policy/approxkl_avg": 1.9207717180252075, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7472304701805115, "step": 1178, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.0006864070892334 }, { "episode": 18880, "epoch": 0.3393608225186037, "loss/policy_avg": 1.3057444095611572, "lr": 9.246549079754602e-06, "objective/entropy": -248.4669189453125, "objective/kl": 40.090850830078125, "objective/non_score_reward": -4.009085178375244, "objective/rlhf_reward": -16.036340832710266, "objective/scores": 0.0, "policy/approxkl_avg": 12.786033630371094, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5493189692497253, "step": 1179, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9977819919586182 }, { "episode": 18896, "epoch": 0.33964841643599236, "loss/policy_avg": 0.3325832784175873, "lr": 9.2459100204499e-06, "objective/entropy": -348.3007507324219, "objective/kl": 46.740692138671875, "objective/non_score_reward": -4.674069404602051, "objective/rlhf_reward": -16.96294428507487, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 14.970390319824219, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4214663505554199, "step": 1180, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9992746114730835 }, { "episode": 18912, "epoch": 0.339936010353381, "loss/policy_avg": 0.9018368124961853, "lr": 9.245270961145194e-06, "objective/entropy": -379.12945556640625, "objective/kl": 37.683021545410156, "objective/non_score_reward": -3.7683022022247314, "objective/rlhf_reward": -13.516949503627373, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 1.6109604835510254, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.6474652290344238, "step": 1181, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9991075992584229 }, { "episode": 18928, "epoch": 0.3402236042707697, "loss/policy_avg": -0.01987364888191223, "lr": 9.244631901840491e-06, "objective/entropy": -162.23797607421875, "objective/kl": 53.15696716308594, "objective/non_score_reward": -5.315697193145752, "objective/rlhf_reward": -19.812189678759918, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 1.1874985694885254, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.43135493993759155, "step": 1182, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0008223056793213 }, { "episode": 18944, "epoch": 0.34051119818815834, "loss/policy_avg": 0.1262747347354889, "lr": 9.243992842535788e-06, "objective/entropy": -297.56280517578125, "objective/kl": 49.20505142211914, "objective/non_score_reward": -4.920505523681641, "objective/rlhf_reward": -18.340384772329955, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 1.4043607711791992, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5816237926483154, "step": 1183, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.0012969970703125 }, { "episode": 18960, "epoch": 0.340798792105547, "loss/policy_avg": 0.9054136872291565, "lr": 9.243353783231085e-06, "objective/entropy": -217.55908203125, "objective/kl": 46.04097366333008, "objective/non_score_reward": -4.604097366333008, "objective/rlhf_reward": -17.090877327948732, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 39.04450607299805, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.436450719833374, "step": 1184, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9987715482711792 }, { "episode": 18976, "epoch": 0.3410863860229356, "loss/policy_avg": 0.27962133288383484, "lr": 9.242714723926382e-06, "objective/entropy": -304.0128479003906, "objective/kl": 42.55516052246094, "objective/non_score_reward": -4.255516052246094, "objective/rlhf_reward": -15.662815296386164, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 0.5524175763130188, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7890490889549255, "step": 1185, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9997435808181763 }, { "episode": 18992, "epoch": 0.34137397994032426, "loss/policy_avg": 0.13798058032989502, "lr": 9.242075664621679e-06, "objective/entropy": -189.5888671875, "objective/kl": 42.00756072998047, "objective/non_score_reward": -4.200756072998047, "objective/rlhf_reward": -14.40302381515503, "objective/scores": 0.6, "policy/approxkl_avg": 171.9234161376953, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8484805822372437, "step": 1186, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.999582052230835 }, { "episode": 19008, "epoch": 0.3416615738577129, "loss/policy_avg": 0.10782553255558014, "lr": 9.241436605316974e-06, "objective/entropy": -279.416015625, "objective/kl": 41.35896301269531, "objective/non_score_reward": -4.135896682739258, "objective/rlhf_reward": -14.987326710429741, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 5.176931381225586, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5773874521255493, "step": 1187, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9995372295379639 }, { "episode": 19024, "epoch": 0.34194916777510154, "loss/policy_avg": 0.7925869226455688, "lr": 9.240797546012271e-06, "objective/entropy": -337.82208251953125, "objective/kl": 40.824974060058594, "objective/non_score_reward": -4.082497596740723, "objective/rlhf_reward": -14.879392842860565, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 3.134884834289551, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.749229907989502, "step": 1188, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.999743938446045 }, { "episode": 19040, "epoch": 0.3422367616924902, "loss/policy_avg": 0.6008885502815247, "lr": 9.240158486707568e-06, "objective/entropy": -177.244873046875, "objective/kl": 27.401851654052734, "objective/non_score_reward": -2.740185260772705, "objective/rlhf_reward": -9.560741639137268, "objective/scores": 0.35, "policy/approxkl_avg": 21.086898803710938, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6006038188934326, "step": 1189, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9966230392456055 }, { "episode": 19056, "epoch": 0.34252435560987887, "loss/policy_avg": 0.5306156277656555, "lr": 9.239519427402863e-06, "objective/entropy": -266.4140625, "objective/kl": 42.718074798583984, "objective/non_score_reward": -4.271807670593262, "objective/rlhf_reward": -17.087230324745178, "objective/scores": 0.0, "policy/approxkl_avg": 4.91014289855957, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.715212345123291, "step": 1190, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9987106323242188 }, { "episode": 19072, "epoch": 0.3428119495272675, "loss/policy_avg": -0.033838436007499695, "lr": 9.23888036809816e-06, "objective/entropy": -311.21343994140625, "objective/kl": 43.236907958984375, "objective/non_score_reward": -4.323691368103027, "objective/rlhf_reward": -17.294764757156372, "objective/scores": 0.0, "policy/approxkl_avg": 37.34124755859375, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.610100269317627, "step": 1191, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.9982128143310547 }, { "episode": 19088, "epoch": 0.34309954344465615, "loss/policy_avg": 0.5805760622024536, "lr": 9.238241308793457e-06, "objective/entropy": -343.1019287109375, "objective/kl": 33.77574920654297, "objective/non_score_reward": -3.377574920654297, "objective/rlhf_reward": -12.059701542468414, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 22.01586151123047, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5611630082130432, "step": 1192, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9978299140930176 }, { "episode": 19104, "epoch": 0.3433871373620448, "loss/policy_avg": 1.2938847541809082, "lr": 9.237602249488754e-06, "objective/entropy": -62.264404296875, "objective/kl": 41.93548583984375, "objective/non_score_reward": -4.193548679351807, "objective/rlhf_reward": -15.414945089553278, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 86.04387664794922, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7683408260345459, "step": 1193, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.000417947769165 }, { "episode": 19120, "epoch": 0.34367473127943343, "loss/policy_avg": 1.5487582683563232, "lr": 9.236963190184049e-06, "objective/entropy": -289.3804931640625, "objective/kl": 49.80120849609375, "objective/non_score_reward": -4.98012113571167, "objective/rlhf_reward": -18.36422452231939, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 6.569292068481445, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.46806010603904724, "step": 1194, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9995393753051758 }, { "episode": 19136, "epoch": 0.34396232519682207, "loss/policy_avg": 0.24192482233047485, "lr": 9.236324130879346e-06, "objective/entropy": -215.54275512695312, "objective/kl": 53.197200775146484, "objective/non_score_reward": -5.319720268249512, "objective/rlhf_reward": -19.331469844059882, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 39.338043212890625, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5865123271942139, "step": 1195, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.999040126800537 }, { "episode": 19152, "epoch": 0.3442499191142107, "loss/policy_avg": 0.40249311923980713, "lr": 9.235685071574642e-06, "objective/entropy": -209.94752502441406, "objective/kl": 40.4586181640625, "objective/non_score_reward": -4.045861721038818, "objective/rlhf_reward": -16.183446645736694, "objective/scores": 0.0, "policy/approxkl_avg": 5.413271903991699, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6579909324645996, "step": 1196, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9998447895050049 }, { "episode": 19168, "epoch": 0.3445375130315994, "loss/policy_avg": 0.32685503363609314, "lr": 9.23504601226994e-06, "objective/entropy": -197.3863525390625, "objective/kl": 47.421058654785156, "objective/non_score_reward": -4.742105484008789, "objective/rlhf_reward": -17.36430219179781, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 79.58370208740234, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8912713527679443, "step": 1197, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9972530603408813 }, { "episode": 19184, "epoch": 0.34482510694898805, "loss/policy_avg": 0.28150755167007446, "lr": 9.234406952965236e-06, "objective/entropy": -233.89865112304688, "objective/kl": 46.691917419433594, "objective/non_score_reward": -4.669192314147949, "objective/rlhf_reward": -16.943435923258463, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 23.83869171142578, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6185580492019653, "step": 1198, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9979031085968018 }, { "episode": 19200, "epoch": 0.3451127008663767, "loss/policy_avg": 3.03059720993042, "lr": 9.233767893660533e-06, "objective/entropy": -252.32183837890625, "objective/kl": 49.510231018066406, "objective/non_score_reward": -4.951022624969482, "objective/rlhf_reward": -18.070757643381754, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 74.62678527832031, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7264424562454224, "step": 1199, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9983851909637451 }, { "episode": 19216, "epoch": 0.3454002947837653, "loss/policy_avg": 0.04245274141430855, "lr": 9.233128834355828e-06, "objective/entropy": -363.91082763671875, "objective/kl": 39.50959014892578, "objective/non_score_reward": -3.9509592056274414, "objective/rlhf_reward": -13.681130828634773, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 4.117827415466309, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6777689456939697, "step": 1200, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9978601932525635 }, { "episode": 19232, "epoch": 0.34568788870115397, "loss/policy_avg": 0.6620116829872131, "lr": 9.232489775051125e-06, "objective/entropy": -296.2205505371094, "objective/kl": 37.862022399902344, "objective/non_score_reward": -3.7862019538879395, "objective/rlhf_reward": -13.319978828701089, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 44.392982482910156, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7481371164321899, "step": 1201, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.998541235923767 }, { "episode": 19248, "epoch": 0.3459754826185426, "loss/policy_avg": 0.7233671545982361, "lr": 9.231850715746422e-06, "objective/entropy": -280.6005859375, "objective/kl": 52.35272979736328, "objective/non_score_reward": -5.235273361206055, "objective/rlhf_reward": -19.336973939005453, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 8.845357894897461, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5984178781509399, "step": 1202, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9979088306427002 }, { "episode": 19264, "epoch": 0.34626307653593125, "loss/policy_avg": 0.1777988076210022, "lr": 9.231211656441719e-06, "objective/entropy": -296.9227294921875, "objective/kl": 40.9736328125, "objective/non_score_reward": -4.097362995147705, "objective/rlhf_reward": -14.873679721149141, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 4.720394134521484, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.9094287157058716, "step": 1203, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 16, "val/ratio": 1.9991389513015747 }, { "episode": 19280, "epoch": 0.3465506704533199, "loss/policy_avg": 2.0115551948547363, "lr": 9.230572597137016e-06, "objective/entropy": -268.8210144042969, "objective/kl": 52.00248718261719, "objective/non_score_reward": -5.200248718261719, "objective/rlhf_reward": -19.45935921958032, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 13.99363899230957, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.626105785369873, "step": 1204, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9985575675964355 }, { "episode": 19296, "epoch": 0.3468382643707086, "loss/policy_avg": 0.13095124065876007, "lr": 9.229933537832311e-06, "objective/entropy": -323.0357666015625, "objective/kl": 39.02614974975586, "objective/non_score_reward": -3.9026148319244385, "objective/rlhf_reward": -14.05420026084478, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 14.40045166015625, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7898708581924438, "step": 1205, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.998520016670227 }, { "episode": 19312, "epoch": 0.3471258582880972, "loss/policy_avg": 0.025498755276203156, "lr": 9.229294478527608e-06, "objective/entropy": -376.06976318359375, "objective/kl": 41.15854263305664, "objective/non_score_reward": -4.115854263305664, "objective/rlhf_reward": -14.947645270618136, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 1.1531376838684082, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6971213817596436, "step": 1206, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.00040864944458 }, { "episode": 19328, "epoch": 0.34741345220548586, "loss/policy_avg": 0.11429200321435928, "lr": 9.228655419222905e-06, "objective/entropy": -297.4051513671875, "objective/kl": 39.18128967285156, "objective/non_score_reward": -3.9181292057037354, "objective/rlhf_reward": -14.293914415923457, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 5.703296184539795, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6841511726379395, "step": 1207, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 2.0045528411865234 }, { "episode": 19344, "epoch": 0.3477010461228745, "loss/policy_avg": 0.28762081265449524, "lr": 9.228016359918202e-06, "objective/entropy": -374.4822082519531, "objective/kl": 36.94370651245117, "objective/non_score_reward": -3.694370746612549, "objective/rlhf_reward": -12.952654238018106, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 24.103839874267578, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7110795974731445, "step": 1208, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9981788396835327 }, { "episode": 19360, "epoch": 0.34798864004026314, "loss/policy_avg": 0.10788074135780334, "lr": 9.227377300613499e-06, "objective/entropy": -375.04742431640625, "objective/kl": 42.19403839111328, "objective/non_score_reward": -4.219404220581055, "objective/rlhf_reward": -15.499013760176998, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 25.455957412719727, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7330628037452698, "step": 1209, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 1.9995394945144653 }, { "episode": 19376, "epoch": 0.3482762339576518, "loss/policy_avg": 1.4270000457763672, "lr": 9.226738241308795e-06, "objective/entropy": -103.56452178955078, "objective/kl": 52.71872329711914, "objective/non_score_reward": -5.271872520446777, "objective/rlhf_reward": -19.571718060764013, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 10.217245101928711, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5531854629516602, "step": 1210, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9983043670654297 }, { "episode": 19392, "epoch": 0.3485638278750404, "loss/policy_avg": 1.023619532585144, "lr": 9.22609918200409e-06, "objective/entropy": -337.45660400390625, "objective/kl": 42.48429870605469, "objective/non_score_reward": -4.248429775238037, "objective/rlhf_reward": -15.593719577789308, "objective/scores": 0.35, "policy/approxkl_avg": 7.106929779052734, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8562592267990112, "step": 1211, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9985864162445068 }, { "episode": 19408, "epoch": 0.3488514217924291, "loss/policy_avg": 0.21168407797813416, "lr": 9.225460122699387e-06, "objective/entropy": -173.31109619140625, "objective/kl": 42.14552307128906, "objective/non_score_reward": -4.21455192565918, "objective/rlhf_reward": -16.858208656311035, "objective/scores": 0.0, "policy/approxkl_avg": 2.470315933227539, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6201061010360718, "step": 1212, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0000545978546143 }, { "episode": 19424, "epoch": 0.34913901570981776, "loss/policy_avg": 0.22564297914505005, "lr": 9.224821063394683e-06, "objective/entropy": -156.56170654296875, "objective/kl": 49.94196701049805, "objective/non_score_reward": -4.994196891784668, "objective/rlhf_reward": -18.576787567138673, "objective/scores": 0.35, "policy/approxkl_avg": 8.744648933410645, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5609461069107056, "step": 1213, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9996341466903687 }, { "episode": 19440, "epoch": 0.3494266096272064, "loss/policy_avg": 0.9587266445159912, "lr": 9.22418200408998e-06, "objective/entropy": -143.19790649414062, "objective/kl": 46.37242889404297, "objective/non_score_reward": -4.637243270874023, "objective/rlhf_reward": -17.09837494334732, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 18.623043060302734, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6094132661819458, "step": 1214, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9985774755477905 }, { "episode": 19456, "epoch": 0.34971420354459504, "loss/policy_avg": 1.7824742794036865, "lr": 9.223542944785276e-06, "objective/entropy": -157.23646545410156, "objective/kl": 40.71059799194336, "objective/non_score_reward": -4.0710601806640625, "objective/rlhf_reward": -14.803286793644787, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 27.44439697265625, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5854488015174866, "step": 1215, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9976880550384521 }, { "episode": 19472, "epoch": 0.3500017974619837, "loss/policy_avg": 0.9863942861557007, "lr": 9.222903885480573e-06, "objective/entropy": -339.10675048828125, "objective/kl": 39.569740295410156, "objective/non_score_reward": -3.9569742679595947, "objective/rlhf_reward": -13.42789731025696, "objective/scores": 0.6, "policy/approxkl_avg": 14.651338577270508, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5935852527618408, "step": 1216, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.997494101524353 }, { "episode": 19488, "epoch": 0.3502893913793723, "loss/policy_avg": 2.450385332107544, "lr": 9.22226482617587e-06, "objective/entropy": -211.38162231445312, "objective/kl": 37.93001174926758, "objective/non_score_reward": -3.793001174926758, "objective/rlhf_reward": -13.793402769652705, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 32.12030792236328, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6932085156440735, "step": 1217, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9978573322296143 }, { "episode": 19504, "epoch": 0.35057698529676096, "loss/policy_avg": -0.6133419275283813, "lr": 9.221625766871165e-06, "objective/entropy": -215.69464111328125, "objective/kl": 47.69563293457031, "objective/non_score_reward": -4.769562721252441, "objective/rlhf_reward": -17.562479579242403, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 2.877052068710327, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.41634631156921387, "step": 1218, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0006275177001953 }, { "episode": 19520, "epoch": 0.3508645792141496, "loss/policy_avg": -0.5479239821434021, "lr": 9.220986707566462e-06, "objective/entropy": -179.13613891601562, "objective/kl": 43.520530700683594, "objective/non_score_reward": -4.352053165435791, "objective/rlhf_reward": -15.95761452159439, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 6.914542198181152, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6683953404426575, "step": 1219, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0006399154663086 }, { "episode": 19536, "epoch": 0.3511521731315383, "loss/policy_avg": 0.5882867574691772, "lr": 9.220347648261759e-06, "objective/entropy": -169.6424560546875, "objective/kl": 54.354270935058594, "objective/non_score_reward": -5.435427665710449, "objective/rlhf_reward": -20.416197094947023, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 1.5751432180404663, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7578393816947937, "step": 1220, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.999896764755249 }, { "episode": 19552, "epoch": 0.35143976704892693, "loss/policy_avg": 2.956665515899658, "lr": 9.219708588957056e-06, "objective/entropy": -427.91717529296875, "objective/kl": 37.71200180053711, "objective/non_score_reward": -3.771200656890869, "objective/rlhf_reward": -13.569030129703219, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 36.06583023071289, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.523029088973999, "step": 1221, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.9991230964660645 }, { "episode": 19568, "epoch": 0.35172736096631557, "loss/policy_avg": -0.12964464724063873, "lr": 9.219069529652353e-06, "objective/entropy": 37.52610778808594, "objective/kl": 35.717201232910156, "objective/non_score_reward": -3.5717201232910156, "objective/rlhf_reward": -11.363160525203917, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 5.609069347381592, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.417826384305954, "step": 1222, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0027952194213867 }, { "episode": 19584, "epoch": 0.3520149548837042, "loss/policy_avg": -0.018821537494659424, "lr": 9.21843047034765e-06, "objective/entropy": -155.95465087890625, "objective/kl": 47.509437561035156, "objective/non_score_reward": -4.750944137573242, "objective/rlhf_reward": -16.080058012844297, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 4.908452033996582, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5225849151611328, "step": 1223, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.000640630722046 }, { "episode": 19600, "epoch": 0.35230254880109285, "loss/policy_avg": 0.682684063911438, "lr": 9.217791411042945e-06, "objective/entropy": -371.6395568847656, "objective/kl": 33.488853454589844, "objective/non_score_reward": -3.3488855361938477, "objective/rlhf_reward": -11.272836150900396, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 52.392059326171875, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6742763519287109, "step": 1224, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9980804920196533 }, { "episode": 19616, "epoch": 0.3525901427184815, "loss/policy_avg": 0.07087349891662598, "lr": 9.217152351738242e-06, "objective/entropy": -209.95709228515625, "objective/kl": 40.607330322265625, "objective/non_score_reward": -4.060732841491699, "objective/rlhf_reward": -14.84293112754822, "objective/scores": 0.35, "policy/approxkl_avg": 4.942096710205078, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7620400190353394, "step": 1225, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.999511957168579 }, { "episode": 19632, "epoch": 0.35287773663587013, "loss/policy_avg": 0.12689802050590515, "lr": 9.216513292433539e-06, "objective/entropy": -281.868896484375, "objective/kl": 50.71034240722656, "objective/non_score_reward": -5.0710344314575195, "objective/rlhf_reward": -20.284136533737183, "objective/scores": 0.0, "policy/approxkl_avg": 25.16387176513672, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.42706865072250366, "step": 1226, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9989101886749268 }, { "episode": 19648, "epoch": 0.35316533055325877, "loss/policy_avg": -1.2583229541778564, "lr": 9.215874233128836e-06, "objective/entropy": -297.2579040527344, "objective/kl": 45.07452392578125, "objective/non_score_reward": -4.507452487945557, "objective/rlhf_reward": -15.106090699077818, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 16.338523864746094, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.6911603212356567, "step": 1227, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.0020761489868164 }, { "episode": 19664, "epoch": 0.35345292447064747, "loss/policy_avg": 0.7108262181282043, "lr": 9.215235173824132e-06, "objective/entropy": -236.56329345703125, "objective/kl": 37.1826286315918, "objective/non_score_reward": -3.7182631492614746, "objective/rlhf_reward": -13.139719382921854, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 4.697059631347656, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7521039843559265, "step": 1228, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.998537302017212 }, { "episode": 19680, "epoch": 0.3537405183880361, "loss/policy_avg": -0.812086820602417, "lr": 9.21459611451943e-06, "objective/entropy": -143.27023315429688, "objective/kl": 39.98391342163086, "objective/non_score_reward": -3.9983913898468018, "objective/rlhf_reward": -14.634316050742548, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 5.659043312072754, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.777701735496521, "step": 1229, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.0163586139678955 }, { "episode": 19696, "epoch": 0.35402811230542475, "loss/policy_avg": -0.1462898999452591, "lr": 9.213957055214725e-06, "objective/entropy": -355.7449951171875, "objective/kl": 36.1224250793457, "objective/non_score_reward": -3.6122426986694336, "objective/rlhf_reward": -13.070368626204829, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 37.19526290893555, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7280309200286865, "step": 1230, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.999755859375 }, { "episode": 19712, "epoch": 0.3543157062228134, "loss/policy_avg": 0.3494728207588196, "lr": 9.213317995910021e-06, "objective/entropy": -322.95806884765625, "objective/kl": 43.021759033203125, "objective/non_score_reward": -4.302175998687744, "objective/rlhf_reward": -15.085997285620245, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 28.19770050048828, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5790446400642395, "step": 1231, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9979618787765503 }, { "episode": 19728, "epoch": 0.354603300140202, "loss/policy_avg": 0.834531307220459, "lr": 9.212678936605318e-06, "objective/entropy": -387.41314697265625, "objective/kl": 29.125343322753906, "objective/non_score_reward": -2.912534236907959, "objective/rlhf_reward": -10.271535494414668, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 2.478957176208496, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6434707641601562, "step": 1232, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 15, "val/ratio": 1.9994601011276245 }, { "episode": 19744, "epoch": 0.35489089405759067, "loss/policy_avg": 1.6608116626739502, "lr": 9.212039877300615e-06, "objective/entropy": -231.29391479492188, "objective/kl": 41.10798645019531, "objective/non_score_reward": -4.1107988357543945, "objective/rlhf_reward": -14.618364925655435, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 11.099143028259277, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4798949956893921, "step": 1233, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9997162818908691 }, { "episode": 19760, "epoch": 0.3551784879749793, "loss/policy_avg": -0.27888932824134827, "lr": 9.21140081799591e-06, "objective/entropy": -264.94256591796875, "objective/kl": 54.752723693847656, "objective/non_score_reward": -5.475272178649902, "objective/rlhf_reward": -20.296968731943686, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 4.037303447723389, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5578334331512451, "step": 1234, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9999535083770752 }, { "episode": 19776, "epoch": 0.355466081892368, "loss/policy_avg": 0.3479851484298706, "lr": 9.210761758691207e-06, "objective/entropy": -308.5455627441406, "objective/kl": 37.39442443847656, "objective/non_score_reward": -3.7394423484802246, "objective/rlhf_reward": -10.557770109176635, "objective/scores": 1.1, "policy/approxkl_avg": 9.637372970581055, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6503261923789978, "step": 1235, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.998403787612915 }, { "episode": 19792, "epoch": 0.35575367580975664, "loss/policy_avg": 0.3700852394104004, "lr": 9.210122699386504e-06, "objective/entropy": -192.52035522460938, "objective/kl": 39.61968231201172, "objective/non_score_reward": -3.961968421936035, "objective/rlhf_reward": -13.725167097822698, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 10.894691467285156, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.780241847038269, "step": 1236, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9981834888458252 }, { "episode": 19808, "epoch": 0.3560412697271453, "loss/policy_avg": -0.11194395273923874, "lr": 9.2094836400818e-06, "objective/entropy": -353.60296630859375, "objective/kl": 15.605920791625977, "objective/non_score_reward": -1.5605919361114502, "objective/rlhf_reward": -4.842368102073669, "objective/scores": 0.35, "policy/approxkl_avg": 9.558113098144531, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.8214880228042603, "step": 1237, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.999047875404358 }, { "episode": 19824, "epoch": 0.3563288636445339, "loss/policy_avg": 0.09466155618429184, "lr": 9.208844580777096e-06, "objective/entropy": -271.57666015625, "objective/kl": 31.521411895751953, "objective/non_score_reward": -3.1521410942077637, "objective/rlhf_reward": -10.946705584943878, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 1.6647242307662964, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6003366708755493, "step": 1238, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 2.0003809928894043 }, { "episode": 19840, "epoch": 0.35661645756192256, "loss/policy_avg": -0.24403750896453857, "lr": 9.208205521472393e-06, "objective/entropy": -300.3720397949219, "objective/kl": 40.926780700683594, "objective/non_score_reward": -4.092678070068359, "objective/rlhf_reward": -16.370712757110596, "objective/scores": 0.0, "policy/approxkl_avg": 6.308424472808838, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5148023962974548, "step": 1239, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0003509521484375 }, { "episode": 19856, "epoch": 0.3569040514793112, "loss/policy_avg": -0.29317817091941833, "lr": 9.20756646216769e-06, "objective/entropy": -320.2080383300781, "objective/kl": 37.338130950927734, "objective/non_score_reward": -3.7338132858276367, "objective/rlhf_reward": -13.378994553294731, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 11.110733032226562, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.6728114485740662, "step": 1240, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.0074212551116943 }, { "episode": 19872, "epoch": 0.35719164539669984, "loss/policy_avg": 0.6977443099021912, "lr": 9.206927402862987e-06, "objective/entropy": -253.653076171875, "objective/kl": 34.72311782836914, "objective/non_score_reward": -3.4723119735717773, "objective/rlhf_reward": -12.332989542689873, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 3.2154245376586914, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7579048871994019, "step": 1241, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9991308450698853 }, { "episode": 19888, "epoch": 0.3574792393140885, "loss/policy_avg": 0.31379836797714233, "lr": 9.206288343558284e-06, "objective/entropy": -382.63153076171875, "objective/kl": 35.638206481933594, "objective/non_score_reward": -3.5638208389282227, "objective/rlhf_reward": -12.699024288859919, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 3.8277387619018555, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6055066585540771, "step": 1242, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9976744651794434 }, { "episode": 19904, "epoch": 0.3577668332314772, "loss/policy_avg": 0.6324791312217712, "lr": 9.205649284253579e-06, "objective/entropy": -335.550537109375, "objective/kl": 34.992305755615234, "objective/non_score_reward": -3.4992306232452393, "objective/rlhf_reward": -12.172093506130288, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 14.330108642578125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7360525131225586, "step": 1243, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9994398355484009 }, { "episode": 19920, "epoch": 0.3580544271488658, "loss/policy_avg": 0.4071337580680847, "lr": 9.205010224948876e-06, "objective/entropy": -462.183837890625, "objective/kl": 38.8656120300293, "objective/non_score_reward": -3.886561393737793, "objective/rlhf_reward": -15.546245574951172, "objective/scores": 0.0, "policy/approxkl_avg": 20.43769645690918, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.5784210562705994, "step": 1244, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9969539642333984 }, { "episode": 19936, "epoch": 0.35834202106625446, "loss/policy_avg": 0.46235209703445435, "lr": 9.204371165644173e-06, "objective/entropy": -255.4346923828125, "objective/kl": 47.181739807128906, "objective/non_score_reward": -4.718174457550049, "objective/rlhf_reward": -17.547184023886842, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 90.3627700805664, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5365368723869324, "step": 1245, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9959521293640137 }, { "episode": 19952, "epoch": 0.3586296149836431, "loss/policy_avg": 5.58965539932251, "lr": 9.20373210633947e-06, "objective/entropy": -187.67587280273438, "objective/kl": 51.5475959777832, "objective/non_score_reward": -5.154759883880615, "objective/rlhf_reward": -19.19520695944604, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 10.334053039550781, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7041340470314026, "step": 1246, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9976823329925537 }, { "episode": 19968, "epoch": 0.35891720890103174, "loss/policy_avg": 0.3098609447479248, "lr": 9.203093047034766e-06, "objective/entropy": -345.39776611328125, "objective/kl": 37.433502197265625, "objective/non_score_reward": -3.743350028991699, "objective/rlhf_reward": -12.573400235176088, "objective/scores": 0.6, "policy/approxkl_avg": 0.7677746415138245, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7975311279296875, "step": 1247, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 2.0004148483276367 }, { "episode": 19984, "epoch": 0.3592048028184204, "loss/policy_avg": 0.020045127719640732, "lr": 9.202453987730062e-06, "objective/entropy": -99.05142211914062, "objective/kl": 42.77995300292969, "objective/non_score_reward": -4.2779951095581055, "objective/rlhf_reward": -15.507860336367209, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 17.719921112060547, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7262557148933411, "step": 1248, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.999335527420044 }, { "episode": 20000, "epoch": 0.359492396735809, "loss/policy_avg": 1.0318403244018555, "lr": 9.201814928425358e-06, "objective/entropy": -179.522705078125, "objective/kl": 54.78431701660156, "objective/non_score_reward": -5.478431701660156, "objective/rlhf_reward": -20.46312818965469, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 3.616248846054077, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4878614544868469, "step": 1249, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9980541467666626 }, { "episode": 20016, "epoch": 0.3597799906531977, "loss/policy_avg": 10.035224914550781, "lr": 9.201175869120655e-06, "objective/entropy": -183.2954559326172, "objective/kl": 39.52976608276367, "objective/non_score_reward": -3.9529762268066406, "objective/rlhf_reward": -13.689199628607305, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 38.14386749267578, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5370172262191772, "step": 1250, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0050442218780518 }, { "episode": 20032, "epoch": 0.36006758457058635, "loss/policy_avg": 1.4021141529083252, "lr": 9.200536809815952e-06, "objective/entropy": -203.68487548828125, "objective/kl": 51.40492248535156, "objective/non_score_reward": -5.1404924392700195, "objective/rlhf_reward": -18.73714053181083, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 24.684858322143555, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5602384805679321, "step": 1251, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.0000431537628174 }, { "episode": 20048, "epoch": 0.360355178487975, "loss/policy_avg": 0.2842300236225128, "lr": 9.199897750511249e-06, "objective/entropy": -192.44725036621094, "objective/kl": 48.638309478759766, "objective/non_score_reward": -4.86383056640625, "objective/rlhf_reward": -17.939551436694796, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 6.930276870727539, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7645981311798096, "step": 1252, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.000037908554077 }, { "episode": 20064, "epoch": 0.36064277240536363, "loss/policy_avg": 1.5568498373031616, "lr": 9.199258691206546e-06, "objective/entropy": -397.03009033203125, "objective/kl": 34.783382415771484, "objective/non_score_reward": -3.4783384799957275, "objective/rlhf_reward": -12.587841186553163, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 4.947456359863281, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6179549694061279, "step": 1253, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.0005686283111572 }, { "episode": 20080, "epoch": 0.36093036632275227, "loss/policy_avg": 0.6185894012451172, "lr": 9.198619631901841e-06, "objective/entropy": -143.19839477539062, "objective/kl": 50.01347732543945, "objective/non_score_reward": -5.00134801864624, "objective/rlhf_reward": -18.663756421118407, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 22.466060638427734, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6080735325813293, "step": 1254, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9987685680389404 }, { "episode": 20096, "epoch": 0.3612179602401409, "loss/policy_avg": 0.6289013028144836, "lr": 9.197980572597138e-06, "objective/entropy": -90.98287963867188, "objective/kl": 52.32436752319336, "objective/non_score_reward": -5.2324371337890625, "objective/rlhf_reward": -19.58811192801538, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 101.09732818603516, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.9253355264663696, "step": 1255, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9966038465499878 }, { "episode": 20112, "epoch": 0.36150555415752955, "loss/policy_avg": 0.08400177955627441, "lr": 9.197341513292433e-06, "objective/entropy": -267.1910705566406, "objective/kl": 45.014888763427734, "objective/non_score_reward": -4.50148868560791, "objective/rlhf_reward": -16.627352097121577, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 19.924392700195312, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6691485643386841, "step": 1256, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9983479976654053 }, { "episode": 20128, "epoch": 0.3617931480749182, "loss/policy_avg": 0.039717625826597214, "lr": 9.19670245398773e-06, "objective/entropy": -391.6705322265625, "objective/kl": 36.95433044433594, "objective/non_score_reward": -3.6954331398010254, "objective/rlhf_reward": -13.422482692931574, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 7.070642948150635, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6884047985076904, "step": 1257, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 2.0150375366210938 }, { "episode": 20144, "epoch": 0.3620807419923069, "loss/policy_avg": -0.01711561344563961, "lr": 9.196063394683027e-06, "objective/entropy": -358.15216064453125, "objective/kl": 35.46274185180664, "objective/non_score_reward": -3.546274185180664, "objective/rlhf_reward": -11.261377964855406, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 5.329797744750977, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5069355964660645, "step": 1258, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0012502670288086 }, { "episode": 20160, "epoch": 0.3623683359096955, "loss/policy_avg": 2.196005344390869, "lr": 9.195424335378324e-06, "objective/entropy": -251.62913513183594, "objective/kl": 35.38679885864258, "objective/non_score_reward": -3.538680076599121, "objective/rlhf_reward": -12.329891677173684, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 30.892749786376953, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5522067546844482, "step": 1259, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.998639464378357 }, { "episode": 20176, "epoch": 0.36265592982708417, "loss/policy_avg": 0.9676313400268555, "lr": 9.19478527607362e-06, "objective/entropy": -189.81814575195312, "objective/kl": 39.81970977783203, "objective/non_score_reward": -3.981971025466919, "objective/rlhf_reward": -14.58624916365686, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 8.527480125427246, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7795034646987915, "step": 1260, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9979780912399292 }, { "episode": 20192, "epoch": 0.3629435237444728, "loss/policy_avg": 1.3823682069778442, "lr": 9.194146216768916e-06, "objective/entropy": -271.44024658203125, "objective/kl": 34.960262298583984, "objective/non_score_reward": -3.496026039123535, "objective/rlhf_reward": -12.605502464858393, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 38.07636260986328, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6255158185958862, "step": 1261, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9972703456878662 }, { "episode": 20208, "epoch": 0.36323111766186145, "loss/policy_avg": 0.5843929648399353, "lr": 9.193507157464213e-06, "objective/entropy": -270.2685546875, "objective/kl": 39.74645233154297, "objective/non_score_reward": -3.9746453762054443, "objective/rlhf_reward": -13.4985812664032, "objective/scores": 0.6, "policy/approxkl_avg": 23.453624725341797, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6021866202354431, "step": 1262, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.000720500946045 }, { "episode": 20224, "epoch": 0.3635187115792501, "loss/policy_avg": 0.5454354882240295, "lr": 9.19286809815951e-06, "objective/entropy": -297.3895263671875, "objective/kl": 28.559341430664062, "objective/non_score_reward": -2.8559341430664062, "objective/rlhf_reward": -9.476325581746037, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 6.098499298095703, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.5712930560112, "step": 1263, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9995832443237305 }, { "episode": 20240, "epoch": 0.3638063054966387, "loss/policy_avg": 0.07742331176996231, "lr": 9.192229038854807e-06, "objective/entropy": -366.4019775390625, "objective/kl": 31.39904022216797, "objective/non_score_reward": -3.139904022216797, "objective/rlhf_reward": -11.181014397231442, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 3.4454469680786133, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4675356149673462, "step": 1264, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.0007882118225098 }, { "episode": 20256, "epoch": 0.36409389941402737, "loss/policy_avg": -0.4216213822364807, "lr": 9.191589979550103e-06, "objective/entropy": -186.52435302734375, "objective/kl": 40.96942901611328, "objective/non_score_reward": -4.096942901611328, "objective/rlhf_reward": -16.387770891189575, "objective/scores": 0.0, "policy/approxkl_avg": 2.0072810649871826, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5078809261322021, "step": 1265, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.002551555633545 }, { "episode": 20272, "epoch": 0.36438149333141606, "loss/policy_avg": 1.1457396745681763, "lr": 9.1909509202454e-06, "objective/entropy": -247.307373046875, "objective/kl": 44.37767028808594, "objective/non_score_reward": -4.437767505645752, "objective/rlhf_reward": -16.146950039927084, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 6.405750274658203, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5040192604064941, "step": 1266, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9998996257781982 }, { "episode": 20288, "epoch": 0.3646690872488047, "loss/policy_avg": 1.983429193496704, "lr": 9.190311860940695e-06, "objective/entropy": -259.5283203125, "objective/kl": 44.60137939453125, "objective/non_score_reward": -4.460138320922852, "objective/rlhf_reward": -16.515038762122316, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 9.213485717773438, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5018559098243713, "step": 1267, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9977130889892578 }, { "episode": 20304, "epoch": 0.36495668116619334, "loss/policy_avg": -0.08270694315433502, "lr": 9.189672801635992e-06, "objective/entropy": -207.70277404785156, "objective/kl": 25.425886154174805, "objective/non_score_reward": -2.542588472366333, "objective/rlhf_reward": -8.222942541317876, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 5.772380828857422, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6019006967544556, "step": 1268, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 2.00008487701416 }, { "episode": 20320, "epoch": 0.365244275083582, "loss/policy_avg": 0.7048158645629883, "lr": 9.18903374233129e-06, "objective/entropy": -418.521728515625, "objective/kl": 43.9279899597168, "objective/non_score_reward": -4.392799377441406, "objective/rlhf_reward": -15.17119655609131, "objective/scores": 0.6, "policy/approxkl_avg": 29.09109115600586, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4880741834640503, "step": 1269, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9990557432174683 }, { "episode": 20336, "epoch": 0.3655318690009706, "loss/policy_avg": 0.3311663269996643, "lr": 9.188394683026586e-06, "objective/entropy": -357.50665283203125, "objective/kl": 41.803627014160156, "objective/non_score_reward": -4.180362701416016, "objective/rlhf_reward": -12.321451401710512, "objective/scores": 1.1, "policy/approxkl_avg": 4.1715617179870605, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5469897985458374, "step": 1270, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9992175102233887 }, { "episode": 20352, "epoch": 0.36581946291835926, "loss/policy_avg": 3.735193967819214, "lr": 9.187755623721883e-06, "objective/entropy": -384.55303955078125, "objective/kl": 35.176979064941406, "objective/non_score_reward": -3.517697811126709, "objective/rlhf_reward": -14.070791721343994, "objective/scores": 0.0, "policy/approxkl_avg": 1.0305461883544922, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5758171677589417, "step": 1271, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.0077285766601562 }, { "episode": 20368, "epoch": 0.3661070568357479, "loss/policy_avg": 0.06244201213121414, "lr": 9.187116564417178e-06, "objective/entropy": -342.1855773925781, "objective/kl": 31.024255752563477, "objective/non_score_reward": -3.1024258136749268, "objective/rlhf_reward": -10.286997260824714, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 5.9018354415893555, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.580519437789917, "step": 1272, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9982578754425049 }, { "episode": 20384, "epoch": 0.3663946507531366, "loss/policy_avg": 6.9979753494262695, "lr": 9.186477505112475e-06, "objective/entropy": -217.67437744140625, "objective/kl": 38.800628662109375, "objective/non_score_reward": -3.8800628185272217, "objective/rlhf_reward": -12.596531902195188, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 4.030364036560059, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6612330675125122, "step": 1273, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.0009727478027344 }, { "episode": 20400, "epoch": 0.36668224467052524, "loss/policy_avg": 0.8592139482498169, "lr": 9.185838445807772e-06, "objective/entropy": -234.4580841064453, "objective/kl": 37.00343322753906, "objective/non_score_reward": -3.700343370437622, "objective/rlhf_reward": -13.320420625622631, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 4.023309707641602, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6374994516372681, "step": 1274, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9995553493499756 }, { "episode": 20416, "epoch": 0.3669698385879139, "loss/policy_avg": 1.459716796875, "lr": 9.185199386503069e-06, "objective/entropy": -113.45836639404297, "objective/kl": 49.175601959228516, "objective/non_score_reward": -4.917560577392578, "objective/rlhf_reward": -16.746522818447325, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 1.0052738189697266, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6882282495498657, "step": 1275, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0023000240325928 }, { "episode": 20432, "epoch": 0.3672574325053025, "loss/policy_avg": 0.5852086544036865, "lr": 9.184560327198366e-06, "objective/entropy": -413.2344970703125, "objective/kl": 31.32343292236328, "objective/non_score_reward": -3.132343292236328, "objective/rlhf_reward": -10.40666693665174, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 24.822080612182617, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6719192266464233, "step": 1276, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9999182224273682 }, { "episode": 20448, "epoch": 0.36754502642269116, "loss/policy_avg": 0.616686224937439, "lr": 9.183921267893663e-06, "objective/entropy": -393.9350280761719, "objective/kl": 24.747295379638672, "objective/non_score_reward": -2.474729537963867, "objective/rlhf_reward": -8.520315268126827, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 4.314851760864258, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.679079532623291, "step": 1277, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9996651411056519 }, { "episode": 20464, "epoch": 0.3678326203400798, "loss/policy_avg": 0.3838120400905609, "lr": 9.183282208588958e-06, "objective/entropy": -162.23703002929688, "objective/kl": 50.82600402832031, "objective/non_score_reward": -5.0826005935668945, "objective/rlhf_reward": -18.505573745044778, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 39.946083068847656, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4879448413848877, "step": 1278, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9972350597381592 }, { "episode": 20480, "epoch": 0.36812021425746844, "loss/policy_avg": 1.2835896015167236, "lr": 9.182643149284255e-06, "objective/entropy": -198.0159912109375, "objective/kl": 50.327056884765625, "objective/non_score_reward": -5.032705783843994, "objective/rlhf_reward": -18.789187481909423, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 14.550447463989258, "policy/clipfrac_avg": 0.25, "policy/entropy_avg": 0.6132803559303284, "step": 1279, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 2.001708984375 }, { "episode": 20496, "epoch": 0.3684078081748571, "loss/policy_avg": 0.3040536344051361, "lr": 9.18200408997955e-06, "objective/entropy": -433.2912292480469, "objective/kl": 40.15401840209961, "objective/non_score_reward": -4.015401840209961, "objective/rlhf_reward": -13.137888704181883, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 1.5323989391326904, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5225884318351746, "step": 1280, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.0002591609954834 }, { "episode": 20512, "epoch": 0.36869540209224577, "loss/policy_avg": -0.04445245862007141, "lr": 9.181365030674847e-06, "objective/entropy": -440.21392822265625, "objective/kl": 24.000215530395508, "objective/non_score_reward": -2.4000213146209717, "objective/rlhf_reward": -8.043826430049494, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 1.4486453533172607, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6184090971946716, "step": 1281, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.001389980316162 }, { "episode": 20528, "epoch": 0.3689829960096344, "loss/policy_avg": -0.36460408568382263, "lr": 9.180725971370144e-06, "objective/entropy": -270.186279296875, "objective/kl": 32.68629455566406, "objective/non_score_reward": -3.268629789352417, "objective/rlhf_reward": -11.470399413172322, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 11.032265663146973, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.5668933391571045, "step": 1282, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.0010056495666504 }, { "episode": 20544, "epoch": 0.36927058992702305, "loss/policy_avg": -0.20742318034172058, "lr": 9.18008691206544e-06, "objective/entropy": -244.41073608398438, "objective/kl": 36.390419006347656, "objective/non_score_reward": -3.6390419006347656, "objective/rlhf_reward": -12.95204750067385, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 1.4826407432556152, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7245742082595825, "step": 1283, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.0020291805267334 }, { "episode": 20560, "epoch": 0.3695581838444117, "loss/policy_avg": 0.14718091487884521, "lr": 9.179447852760737e-06, "objective/entropy": -383.0006408691406, "objective/kl": 26.639236450195312, "objective/non_score_reward": -2.663923740386963, "objective/rlhf_reward": -8.5329884908357, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 1.2779308557510376, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8083609342575073, "step": 1284, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.0004258155822754 }, { "episode": 20576, "epoch": 0.36984577776180033, "loss/policy_avg": 0.3269846439361572, "lr": 9.178808793456033e-06, "objective/entropy": -205.06288146972656, "objective/kl": 38.10704040527344, "objective/non_score_reward": -3.810704231262207, "objective/rlhf_reward": -10.842816925048828, "objective/scores": 1.1, "policy/approxkl_avg": 1.217939853668213, "policy/clipfrac_avg": 0.25, "policy/entropy_avg": 0.4525749087333679, "step": 1285, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0008418560028076 }, { "episode": 20592, "epoch": 0.37013337167918897, "loss/policy_avg": 2.2391862869262695, "lr": 9.17816973415133e-06, "objective/entropy": -326.0638427734375, "objective/kl": 27.3938045501709, "objective/non_score_reward": -2.739380359649658, "objective/rlhf_reward": -9.224188582102457, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 63.29591751098633, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5218819379806519, "step": 1286, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.000434637069702 }, { "episode": 20608, "epoch": 0.3704209655965776, "loss/policy_avg": -0.38558429479599, "lr": 9.177530674846626e-06, "objective/entropy": -407.41094970703125, "objective/kl": 31.058378219604492, "objective/non_score_reward": -3.105837821960449, "objective/rlhf_reward": -10.300645055548223, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 136.85385131835938, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6211172342300415, "step": 1287, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 2.003993511199951 }, { "episode": 20624, "epoch": 0.3707085595139663, "loss/policy_avg": -0.1666550636291504, "lr": 9.176891615541923e-06, "objective/entropy": -369.4403381347656, "objective/kl": 33.419124603271484, "objective/non_score_reward": -3.3419125080108643, "objective/rlhf_reward": -12.04213717940442, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 1.5127824544906616, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.48260343074798584, "step": 1288, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.999541997909546 }, { "episode": 20640, "epoch": 0.37099615343135495, "loss/policy_avg": 1.5224254131317139, "lr": 9.17625255623722e-06, "objective/entropy": -334.0870361328125, "objective/kl": 38.949466705322266, "objective/non_score_reward": -3.894946813583374, "objective/rlhf_reward": -12.656068716884825, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 17.697521209716797, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.60801762342453, "step": 1289, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9981694221496582 }, { "episode": 20656, "epoch": 0.3712837473487436, "loss/policy_avg": 0.6666955351829529, "lr": 9.175613496932517e-06, "objective/entropy": -287.50653076171875, "objective/kl": 38.58717346191406, "objective/non_score_reward": -3.858717203140259, "objective/rlhf_reward": -13.312162282244238, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 37.2802848815918, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6310940980911255, "step": 1290, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9984655380249023 }, { "episode": 20672, "epoch": 0.3715713412661322, "loss/policy_avg": 0.39973124861717224, "lr": 9.174974437627812e-06, "objective/entropy": -344.66583251953125, "objective/kl": 33.819435119628906, "objective/non_score_reward": -3.381943464279175, "objective/rlhf_reward": -12.046821239407421, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 12.951045036315918, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6082382202148438, "step": 1291, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.000288486480713 }, { "episode": 20688, "epoch": 0.37185893518352087, "loss/policy_avg": 0.24316899478435516, "lr": 9.174335378323109e-06, "objective/entropy": -294.3053283691406, "objective/kl": 48.408451080322266, "objective/non_score_reward": -4.840845108032227, "objective/rlhf_reward": -17.63004733721415, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 55.19905090332031, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.614748477935791, "step": 1292, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9976427555084229 }, { "episode": 20704, "epoch": 0.3721465291009095, "loss/policy_avg": 0.5328439474105835, "lr": 9.173696319018406e-06, "objective/entropy": -247.26170349121094, "objective/kl": 36.72327423095703, "objective/non_score_reward": -3.672327995300293, "objective/rlhf_reward": -13.02745175880252, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 2.195164442062378, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7542746663093567, "step": 1293, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9984173774719238 }, { "episode": 20720, "epoch": 0.37243412301829815, "loss/policy_avg": 0.645179033279419, "lr": 9.173057259713703e-06, "objective/entropy": -189.98753356933594, "objective/kl": 40.282005310058594, "objective/non_score_reward": -4.028201103210449, "objective/rlhf_reward": -14.753553592894953, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 5.098132133483887, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5262404084205627, "step": 1294, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9995384216308594 }, { "episode": 20736, "epoch": 0.3727217169356868, "loss/policy_avg": 0.11185142397880554, "lr": 9.172418200409e-06, "objective/entropy": -258.7457275390625, "objective/kl": 43.49762725830078, "objective/non_score_reward": -4.349762916564941, "objective/rlhf_reward": -15.883279883655245, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 10.509754180908203, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6840099692344666, "step": 1295, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 2.000523567199707 }, { "episode": 20752, "epoch": 0.3730093108530755, "loss/policy_avg": 1.094523310661316, "lr": 9.171779141104295e-06, "objective/entropy": -293.841796875, "objective/kl": 37.596309661865234, "objective/non_score_reward": -3.7596311569213867, "objective/rlhf_reward": -13.614692647655573, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 44.498512268066406, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7216756343841553, "step": 1296, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 2.000234603881836 }, { "episode": 20768, "epoch": 0.3732969047704641, "loss/policy_avg": -0.08476521074771881, "lr": 9.171140081799592e-06, "objective/entropy": -317.43218994140625, "objective/kl": 35.878196716308594, "objective/non_score_reward": -3.587820053100586, "objective/rlhf_reward": -12.951279735565187, "objective/scores": 0.35, "policy/approxkl_avg": 8.483059883117676, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6061519384384155, "step": 1297, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.0002965927124023 }, { "episode": 20784, "epoch": 0.37358449868785276, "loss/policy_avg": 0.41005057096481323, "lr": 9.170501022494889e-06, "objective/entropy": -340.95574951171875, "objective/kl": 51.851715087890625, "objective/non_score_reward": -5.185171604156494, "objective/rlhf_reward": -19.00735403696696, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 87.13654327392578, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6143768429756165, "step": 1298, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 14, "val/ratio": 1.9994208812713623 }, { "episode": 20800, "epoch": 0.3738720926052414, "loss/policy_avg": 0.2793387770652771, "lr": 9.169861963190185e-06, "objective/entropy": -318.2176513671875, "objective/kl": 40.0487174987793, "objective/non_score_reward": -4.004871845245361, "objective/rlhf_reward": -14.357627158582794, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 7.048634052276611, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.650529682636261, "step": 1299, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 16, "val/ratio": 1.9998862743377686 }, { "episode": 20816, "epoch": 0.37415968652263004, "loss/policy_avg": 0.1553981602191925, "lr": 9.169222903885482e-06, "objective/entropy": -269.3183898925781, "objective/kl": 43.71953582763672, "objective/non_score_reward": -4.371953964233398, "objective/rlhf_reward": -16.087816333770753, "objective/scores": 0.35, "policy/approxkl_avg": 97.4100570678711, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7116272449493408, "step": 1300, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9980614185333252 }, { "episode": 20832, "epoch": 0.3744472804400187, "loss/policy_avg": -0.7957895994186401, "lr": 9.168583844580777e-06, "objective/entropy": -384.84564208984375, "objective/kl": 36.3896369934082, "objective/non_score_reward": -3.6389639377593994, "objective/rlhf_reward": -13.155856227874757, "objective/scores": 0.35, "policy/approxkl_avg": 21.481582641601562, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5471504926681519, "step": 1301, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.001662254333496 }, { "episode": 20848, "epoch": 0.3747348743574073, "loss/policy_avg": 0.13812817633152008, "lr": 9.167944785276074e-06, "objective/entropy": -332.6588134765625, "objective/kl": 44.57892608642578, "objective/non_score_reward": -4.457892417907715, "objective/rlhf_reward": -16.43156991004944, "objective/scores": 0.35, "policy/approxkl_avg": 11.337006568908691, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7413872480392456, "step": 1302, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 1.9996337890625 }, { "episode": 20864, "epoch": 0.37502246827479596, "loss/policy_avg": -0.6085488200187683, "lr": 9.167305725971371e-06, "objective/entropy": -177.65972900390625, "objective/kl": 30.891742706298828, "objective/non_score_reward": -3.089174747467041, "objective/rlhf_reward": -10.752578291956503, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 2.961477756500244, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5784578323364258, "step": 1303, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.008862257003784 }, { "episode": 20880, "epoch": 0.37531006219218466, "loss/policy_avg": 0.6260799765586853, "lr": 9.166666666666666e-06, "objective/entropy": -402.1231689453125, "objective/kl": 26.68427276611328, "objective/non_score_reward": -2.6684274673461914, "objective/rlhf_reward": -9.348196539908571, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 10.638071060180664, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6720375418663025, "step": 1304, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0005922317504883 }, { "episode": 20896, "epoch": 0.3755976561095733, "loss/policy_avg": -0.11920569092035294, "lr": 9.166027607361963e-06, "objective/entropy": -257.68963623046875, "objective/kl": 45.14056396484375, "objective/non_score_reward": -4.514056205749512, "objective/rlhf_reward": -16.730713162451906, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 13.453465461730957, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6986085176467896, "step": 1305, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.999328374862671 }, { "episode": 20912, "epoch": 0.37588525002696194, "loss/policy_avg": -0.08390214294195175, "lr": 9.16538854805726e-06, "objective/entropy": -222.66043090820312, "objective/kl": 49.427791595458984, "objective/non_score_reward": -4.942779064178467, "objective/rlhf_reward": -18.10925722640811, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 3.095534563064575, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.40725716948509216, "step": 1306, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0003769397735596 }, { "episode": 20928, "epoch": 0.3761728439443506, "loss/policy_avg": 0.5120038986206055, "lr": 9.164749488752557e-06, "objective/entropy": -372.8121337890625, "objective/kl": 32.1931266784668, "objective/non_score_reward": -3.2193126678466797, "objective/rlhf_reward": -11.535615852385192, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 21.06869125366211, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6987943053245544, "step": 1307, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9988491535186768 }, { "episode": 20944, "epoch": 0.3764604378617392, "loss/policy_avg": -0.5875188708305359, "lr": 9.164110429447854e-06, "objective/entropy": -265.3511047363281, "objective/kl": 46.455650329589844, "objective/non_score_reward": -4.645565032958984, "objective/rlhf_reward": -15.658542071224424, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 24.570552825927734, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7431293725967407, "step": 1308, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9992225170135498 }, { "episode": 20960, "epoch": 0.37674803177912786, "loss/policy_avg": -0.13081759214401245, "lr": 9.163471370143149e-06, "objective/entropy": -435.9374084472656, "objective/kl": 42.12052917480469, "objective/non_score_reward": -4.212053298950195, "objective/rlhf_reward": -15.332440936359102, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 11.497260093688965, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7462241649627686, "step": 1309, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 2.0012993812561035 }, { "episode": 20976, "epoch": 0.3770356256965165, "loss/policy_avg": 0.9687138199806213, "lr": 9.162832310838446e-06, "objective/entropy": -300.26806640625, "objective/kl": 38.1783561706543, "objective/non_score_reward": -3.817835569381714, "objective/rlhf_reward": -15.271342277526855, "objective/scores": 0.0, "policy/approxkl_avg": 20.857513427734375, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7775424718856812, "step": 1310, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9987459182739258 }, { "episode": 20992, "epoch": 0.3773232196139052, "loss/policy_avg": -0.2119673490524292, "lr": 9.162193251533743e-06, "objective/entropy": -420.786865234375, "objective/kl": 32.11229705810547, "objective/non_score_reward": -3.2112293243408203, "objective/rlhf_reward": -11.42108495970544, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 2.054572582244873, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6162212491035461, "step": 1311, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.0023930072784424 }, { "episode": 21008, "epoch": 0.37761081353129383, "loss/policy_avg": 0.8040826320648193, "lr": 9.16155419222904e-06, "objective/entropy": -348.49676513671875, "objective/kl": 23.683992385864258, "objective/non_score_reward": -2.368399143218994, "objective/rlhf_reward": -8.14808360102765, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 211.59429931640625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4950878620147705, "step": 1312, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.9991800785064697 }, { "episode": 21024, "epoch": 0.37789840744868247, "loss/policy_avg": 0.3733624815940857, "lr": 9.160915132924337e-06, "objective/entropy": -368.998046875, "objective/kl": 35.06987762451172, "objective/non_score_reward": -3.5069878101348877, "objective/rlhf_reward": -12.423831496302206, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 10.066882133483887, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5737513303756714, "step": 1313, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9968420267105103 }, { "episode": 21040, "epoch": 0.3781860013660711, "loss/policy_avg": 1.1639472246170044, "lr": 9.160276073619634e-06, "objective/entropy": -364.18914794921875, "objective/kl": 25.151416778564453, "objective/non_score_reward": -2.515141487121582, "objective/rlhf_reward": -8.660566544532776, "objective/scores": 0.35, "policy/approxkl_avg": 44.53583526611328, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6184509992599487, "step": 1314, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.999891757965088 }, { "episode": 21056, "epoch": 0.37847359528345975, "loss/policy_avg": 1.6804909706115723, "lr": 9.159637014314929e-06, "objective/entropy": -274.9586181640625, "objective/kl": 38.709739685058594, "objective/non_score_reward": -3.870974063873291, "objective/rlhf_reward": -13.75056292215983, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 89.40251159667969, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7033193707466125, "step": 1315, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.9986300468444824 }, { "episode": 21072, "epoch": 0.3787611892008484, "loss/policy_avg": 0.8930248022079468, "lr": 9.158997955010226e-06, "objective/entropy": -373.98681640625, "objective/kl": 20.77981948852539, "objective/non_score_reward": -2.077981948852539, "objective/rlhf_reward": -6.796156251224216, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 11.34402847290039, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6029479503631592, "step": 1316, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9983603954315186 }, { "episode": 21088, "epoch": 0.37904878311823703, "loss/policy_avg": -0.026193473488092422, "lr": 9.158358895705522e-06, "objective/entropy": -388.14215087890625, "objective/kl": 27.69371795654297, "objective/non_score_reward": -2.76937198638916, "objective/rlhf_reward": -9.415628080785858, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 1.2698187828063965, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.59114670753479, "step": 1317, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9991490840911865 }, { "episode": 21104, "epoch": 0.37933637703562567, "loss/policy_avg": 0.17368023097515106, "lr": 9.15771983640082e-06, "objective/entropy": -317.0810546875, "objective/kl": 35.92310333251953, "objective/non_score_reward": -3.5923104286193848, "objective/rlhf_reward": -9.969241118431093, "objective/scores": 1.1, "policy/approxkl_avg": 26.020530700683594, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7205820083618164, "step": 1318, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9991936683654785 }, { "episode": 21120, "epoch": 0.37962397095301437, "loss/policy_avg": 0.5201810002326965, "lr": 9.157080777096116e-06, "objective/entropy": -314.649658203125, "objective/kl": 46.13852310180664, "objective/non_score_reward": -4.613852500915527, "objective/rlhf_reward": -14.055408573150636, "objective/scores": 1.1, "policy/approxkl_avg": 21.224571228027344, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7175954580307007, "step": 1319, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9987601041793823 }, { "episode": 21136, "epoch": 0.379911564870403, "loss/policy_avg": -0.6473659873008728, "lr": 9.156441717791411e-06, "objective/entropy": -340.4374694824219, "objective/kl": 44.676971435546875, "objective/non_score_reward": -4.4676971435546875, "objective/rlhf_reward": -16.42018948039566, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 6.270391464233398, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6959241628646851, "step": 1320, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.0026345252990723 }, { "episode": 21152, "epoch": 0.38019915878779165, "loss/policy_avg": -0.7370725870132446, "lr": 9.155802658486708e-06, "objective/entropy": -224.52786254882812, "objective/kl": 60.82535934448242, "objective/non_score_reward": -6.082535743713379, "objective/rlhf_reward": -22.930143213272096, "objective/scores": 0.35, "policy/approxkl_avg": 16.976163864135742, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.6182113885879517, "step": 1321, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.0010156631469727 }, { "episode": 21168, "epoch": 0.3804867527051803, "loss/policy_avg": 1.0462374687194824, "lr": 9.155163599182005e-06, "objective/entropy": -332.122314453125, "objective/kl": 53.076698303222656, "objective/non_score_reward": -5.3076701164245605, "objective/rlhf_reward": -19.28326852150434, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 14.435585975646973, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.6350855827331543, "step": 1322, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9990880489349365 }, { "episode": 21184, "epoch": 0.3807743466225689, "loss/policy_avg": 1.7183446884155273, "lr": 9.1545245398773e-06, "objective/entropy": -368.0193786621094, "objective/kl": 39.85797119140625, "objective/non_score_reward": -3.98579740524292, "objective/rlhf_reward": -14.427417838367159, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 11.685422897338867, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.6701849102973938, "step": 1323, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9985411167144775 }, { "episode": 21200, "epoch": 0.38106194053995757, "loss/policy_avg": 0.5340535640716553, "lr": 9.153885480572597e-06, "objective/entropy": -263.06024169921875, "objective/kl": 37.750396728515625, "objective/non_score_reward": -3.7750401496887207, "objective/rlhf_reward": -13.75852470686975, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 3.002713680267334, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7606070041656494, "step": 1324, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 2.000021457672119 }, { "episode": 21216, "epoch": 0.3813495344573462, "loss/policy_avg": 0.060468271374702454, "lr": 9.153246421267894e-06, "objective/entropy": -325.2740783691406, "objective/kl": 36.90810775756836, "objective/non_score_reward": -3.6908109188079834, "objective/rlhf_reward": -13.206984846797539, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 18.143375396728516, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6480287909507751, "step": 1325, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9979503154754639 }, { "episode": 21232, "epoch": 0.3816371283747349, "loss/policy_avg": 0.00686752051115036, "lr": 9.152607361963191e-06, "objective/entropy": -379.060546875, "objective/kl": 37.299293518066406, "objective/non_score_reward": -3.7299294471740723, "objective/rlhf_reward": -13.469119171710357, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 2.0818734169006348, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7073227763175964, "step": 1326, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.000476360321045 }, { "episode": 21248, "epoch": 0.38192472229212354, "loss/policy_avg": 0.34372997283935547, "lr": 9.151968302658488e-06, "objective/entropy": -301.8936767578125, "objective/kl": 37.7095832824707, "objective/non_score_reward": -3.770958423614502, "objective/rlhf_reward": -13.350500599543253, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 12.18560791015625, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8126070499420166, "step": 1327, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.998203992843628 }, { "episode": 21264, "epoch": 0.3822123162095122, "loss/policy_avg": 0.9416133761405945, "lr": 9.151329243353783e-06, "objective/entropy": -323.7284851074219, "objective/kl": 40.846614837646484, "objective/non_score_reward": -4.084661483764648, "objective/rlhf_reward": -14.391234706120429, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 7.648550987243652, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5437846183776855, "step": 1328, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9982171058654785 }, { "episode": 21280, "epoch": 0.3824999101269008, "loss/policy_avg": 0.8060399293899536, "lr": 9.15069018404908e-06, "objective/entropy": -168.87033081054688, "objective/kl": 34.03877639770508, "objective/non_score_reward": -3.4038777351379395, "objective/rlhf_reward": -12.2899983263313, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 39.01153564453125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7910028696060181, "step": 1329, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9972712993621826 }, { "episode": 21296, "epoch": 0.38278750404428946, "loss/policy_avg": 1.3905370235443115, "lr": 9.150051124744377e-06, "objective/entropy": -374.4802551269531, "objective/kl": 35.899139404296875, "objective/non_score_reward": -3.589913845062256, "objective/rlhf_reward": -12.697796588361847, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 8.751789093017578, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5448976755142212, "step": 1330, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9975414276123047 }, { "episode": 21312, "epoch": 0.3830750979616781, "loss/policy_avg": 0.2682191729545593, "lr": 9.149412065439674e-06, "objective/entropy": -418.6098937988281, "objective/kl": 34.77262878417969, "objective/non_score_reward": -3.4772627353668213, "objective/rlhf_reward": -12.56741552641931, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 25.400672912597656, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6386086940765381, "step": 1331, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9991669654846191 }, { "episode": 21328, "epoch": 0.38336269187906674, "loss/policy_avg": 0.2507719397544861, "lr": 9.14877300613497e-06, "objective/entropy": -388.8086242675781, "objective/kl": 25.366113662719727, "objective/non_score_reward": -2.5366110801696777, "objective/rlhf_reward": -8.722612936695185, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 1.4400053024291992, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5974361896514893, "step": 1332, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 2.001107692718506 }, { "episode": 21344, "epoch": 0.3836502857964554, "loss/policy_avg": 2.600416660308838, "lr": 9.148133946830266e-06, "objective/entropy": -396.5406494140625, "objective/kl": 30.072546005249023, "objective/non_score_reward": -3.0072546005249023, "objective/rlhf_reward": -10.204189892086099, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 7.473404884338379, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.7935835123062134, "step": 1333, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 14, "val/ratio": 2.007052421569824 }, { "episode": 21360, "epoch": 0.3839378797138441, "loss/policy_avg": 1.3868666887283325, "lr": 9.147494887525563e-06, "objective/entropy": -175.97232055664062, "objective/kl": 40.14502716064453, "objective/non_score_reward": -4.01450252532959, "objective/rlhf_reward": -14.501751749721123, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 7.633020401000977, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.6726785898208618, "step": 1334, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.0072879791259766 }, { "episode": 21376, "epoch": 0.3842254736312327, "loss/policy_avg": 0.20190608501434326, "lr": 9.14685582822086e-06, "objective/entropy": -335.6683654785156, "objective/kl": 35.63044357299805, "objective/non_score_reward": -3.563044786453247, "objective/rlhf_reward": -12.42735015896232, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 15.596217155456543, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.569972038269043, "step": 1335, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9968949556350708 }, { "episode": 21392, "epoch": 0.38451306754862136, "loss/policy_avg": 0.6353225708007812, "lr": 9.146216768916156e-06, "objective/entropy": -431.4044189453125, "objective/kl": 31.53327178955078, "objective/non_score_reward": -3.153327465057373, "objective/rlhf_reward": -11.132356527264475, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 5.473521709442139, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6514220237731934, "step": 1336, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9975464344024658 }, { "episode": 21408, "epoch": 0.38480066146601, "loss/policy_avg": 0.6485949158668518, "lr": 9.145577709611453e-06, "objective/entropy": -356.7212829589844, "objective/kl": 28.699506759643555, "objective/non_score_reward": -2.869950771331787, "objective/rlhf_reward": -9.532391856388982, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 18.34389877319336, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6353943347930908, "step": 1337, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9984397888183594 }, { "episode": 21424, "epoch": 0.38508825538339864, "loss/policy_avg": 0.1686447262763977, "lr": 9.14493865030675e-06, "objective/entropy": -323.6803283691406, "objective/kl": 41.16382598876953, "objective/non_score_reward": -4.116382598876953, "objective/rlhf_reward": -15.14001658919446, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 9.843915939331055, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.71875, "step": 1338, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9990746974945068 }, { "episode": 21440, "epoch": 0.3853758493007873, "loss/policy_avg": 0.3248051106929779, "lr": 9.144299591002045e-06, "objective/entropy": -177.802978515625, "objective/kl": 33.11115646362305, "objective/non_score_reward": -3.3111162185668945, "objective/rlhf_reward": -11.419636125835488, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 33.397918701171875, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.43310868740081787, "step": 1339, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9984533786773682 }, { "episode": 21456, "epoch": 0.3856634432181759, "loss/policy_avg": 0.6859661340713501, "lr": 9.143660531697342e-06, "objective/entropy": -380.26947021484375, "objective/kl": 43.30434036254883, "objective/non_score_reward": -4.3304338455200195, "objective/rlhf_reward": -15.89790447493371, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 5.699972152709961, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5974143743515015, "step": 1340, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.998788833618164 }, { "episode": 21472, "epoch": 0.38595103713556456, "loss/policy_avg": -0.22041243314743042, "lr": 9.143021472392639e-06, "objective/entropy": -321.98541259765625, "objective/kl": 37.492950439453125, "objective/non_score_reward": -3.7492949962615967, "objective/rlhf_reward": -12.597180223464967, "objective/scores": 0.6, "policy/approxkl_avg": 1.618495225906372, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6155891418457031, "step": 1341, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 2.000842571258545 }, { "episode": 21488, "epoch": 0.38623863105295325, "loss/policy_avg": 0.010817509144544601, "lr": 9.142382413087936e-06, "objective/entropy": -364.04376220703125, "objective/kl": 42.580322265625, "objective/non_score_reward": -4.258032321929932, "objective/rlhf_reward": -15.428009305063803, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 0.8514620661735535, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.700284481048584, "step": 1342, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 2.000159978866577 }, { "episode": 21504, "epoch": 0.3865262249703419, "loss/policy_avg": 0.4330994784832001, "lr": 9.141743353783233e-06, "objective/entropy": -339.93463134765625, "objective/kl": 27.60832977294922, "objective/non_score_reward": -2.760833263397217, "objective/rlhf_reward": -9.664730169860226, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 20.357982635498047, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5951058268547058, "step": 1343, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.0009031295776367 }, { "episode": 21520, "epoch": 0.38681381888773053, "loss/policy_avg": -0.8922510743141174, "lr": 9.14110429447853e-06, "objective/entropy": -188.47885131835938, "objective/kl": 48.26534652709961, "objective/non_score_reward": -4.826534748077393, "objective/rlhf_reward": -16.90613899230957, "objective/scores": 0.6, "policy/approxkl_avg": 2.025063991546631, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7905806303024292, "step": 1344, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.0016136169433594 }, { "episode": 21536, "epoch": 0.38710141280511917, "loss/policy_avg": 0.07814952731132507, "lr": 9.140465235173825e-06, "objective/entropy": -224.74832153320312, "objective/kl": 49.67034912109375, "objective/non_score_reward": -4.967035293579102, "objective/rlhf_reward": -19.868140697479248, "objective/scores": 0.0, "policy/approxkl_avg": 93.35905456542969, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.78722083568573, "step": 1345, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.9987597465515137 }, { "episode": 21552, "epoch": 0.3873890067225078, "loss/policy_avg": 0.11463617533445358, "lr": 9.13982617586912e-06, "objective/entropy": -369.23138427734375, "objective/kl": 26.932374954223633, "objective/non_score_reward": -2.6932373046875, "objective/rlhf_reward": -9.168829474512654, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 18.15654182434082, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7057843208312988, "step": 1346, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9992265701293945 }, { "episode": 21568, "epoch": 0.38767660063989645, "loss/policy_avg": 0.01413014531135559, "lr": 9.139187116564417e-06, "objective/entropy": -32.13642120361328, "objective/kl": 60.65349578857422, "objective/non_score_reward": -6.065349102020264, "objective/rlhf_reward": -22.919761708288817, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 1.7689263820648193, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6423827409744263, "step": 1347, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9994776248931885 }, { "episode": 21584, "epoch": 0.3879641945572851, "loss/policy_avg": 1.2476742267608643, "lr": 9.138548057259714e-06, "objective/entropy": -368.5486755371094, "objective/kl": 40.76191711425781, "objective/non_score_reward": -4.0761919021606445, "objective/rlhf_reward": -14.963131716757445, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 44.87795639038086, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7054693698883057, "step": 1348, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9980682134628296 }, { "episode": 21600, "epoch": 0.3882517884746738, "loss/policy_avg": -0.21976836025714874, "lr": 9.13790899795501e-06, "objective/entropy": -381.8218994140625, "objective/kl": 31.798236846923828, "objective/non_score_reward": -3.179823637008667, "objective/rlhf_reward": -10.319294786453247, "objective/scores": 0.6, "policy/approxkl_avg": 15.000465393066406, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6498135328292847, "step": 1349, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9998157024383545 }, { "episode": 21616, "epoch": 0.3885393823920624, "loss/policy_avg": 0.03217875957489014, "lr": 9.137269938650308e-06, "objective/entropy": -390.0887451171875, "objective/kl": 25.918424606323242, "objective/non_score_reward": -2.5918426513671875, "objective/rlhf_reward": -8.851598107608494, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 10.428825378417969, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6576427221298218, "step": 1350, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.001335620880127 }, { "episode": 21632, "epoch": 0.38882697630945107, "loss/policy_avg": 1.1896159648895264, "lr": 9.136630879345604e-06, "objective/entropy": -400.5634460449219, "objective/kl": 32.69147491455078, "objective/non_score_reward": -3.2691473960876465, "objective/rlhf_reward": -11.652757961948481, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 63.98255920410156, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5532021522521973, "step": 1351, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.001161575317383 }, { "episode": 21648, "epoch": 0.3891145702268397, "loss/policy_avg": 0.9694092273712158, "lr": 9.1359918200409e-06, "objective/entropy": -164.1553955078125, "objective/kl": 39.461395263671875, "objective/non_score_reward": -3.946139335632324, "objective/rlhf_reward": -14.333959738822326, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 29.93909454345703, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.9108109474182129, "step": 1352, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.998304843902588 }, { "episode": 21664, "epoch": 0.38940216414422835, "loss/policy_avg": 0.6806215047836304, "lr": 9.135352760736197e-06, "objective/entropy": -391.24395751953125, "objective/kl": 32.63773727416992, "objective/non_score_reward": -3.2637739181518555, "objective/rlhf_reward": -11.631262619693842, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 16.37675666809082, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7227077484130859, "step": 1353, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9966504573822021 }, { "episode": 21680, "epoch": 0.389689758061617, "loss/policy_avg": 0.6310924887657166, "lr": 9.134713701431493e-06, "objective/entropy": -350.49237060546875, "objective/kl": 40.53452682495117, "objective/non_score_reward": -4.053452968597412, "objective/rlhf_reward": -14.872175505667357, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 2.683610677719116, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6865704655647278, "step": 1354, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 2.001756429672241 }, { "episode": 21696, "epoch": 0.3899773519790056, "loss/policy_avg": 0.9344829320907593, "lr": 9.13407464212679e-06, "objective/entropy": -280.8444519042969, "objective/kl": 45.98351287841797, "objective/non_score_reward": -4.59835147857666, "objective/rlhf_reward": -16.877632939609224, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 15.602426528930664, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7727674245834351, "step": 1355, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.998740553855896 }, { "episode": 21712, "epoch": 0.39026494589639427, "loss/policy_avg": 0.594275712966919, "lr": 9.133435582822087e-06, "objective/entropy": -339.8444519042969, "objective/kl": 37.521331787109375, "objective/non_score_reward": -3.7521331310272217, "objective/rlhf_reward": -13.183703775676797, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 8.196802139282227, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6502610445022583, "step": 1356, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.998910665512085 }, { "episode": 21728, "epoch": 0.39055253981378296, "loss/policy_avg": 0.129550039768219, "lr": 9.132796523517384e-06, "objective/entropy": -325.10711669921875, "objective/kl": 38.26902770996094, "objective/non_score_reward": -3.8269031047821045, "objective/rlhf_reward": -13.751352637019707, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 6.606585502624512, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8427555561065674, "step": 1357, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9986683130264282 }, { "episode": 21744, "epoch": 0.3908401337311716, "loss/policy_avg": 1.861234188079834, "lr": 9.13215746421268e-06, "objective/entropy": -463.657470703125, "objective/kl": 31.77120590209961, "objective/non_score_reward": -3.1771209239959717, "objective/rlhf_reward": -11.329881050673823, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 3.8403820991516113, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6461368203163147, "step": 1358, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9993607997894287 }, { "episode": 21760, "epoch": 0.39112772764856024, "loss/policy_avg": -0.2527350187301636, "lr": 9.131518404907976e-06, "objective/entropy": -417.7566223144531, "objective/kl": 32.408119201660156, "objective/non_score_reward": -3.240811824798584, "objective/rlhf_reward": -10.039528046489927, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 0.9179061055183411, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7252938747406006, "step": 1359, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 2.00144624710083 }, { "episode": 21776, "epoch": 0.3914153215659489, "loss/policy_avg": 0.22104991972446442, "lr": 9.130879345603273e-06, "objective/entropy": -200.26748657226562, "objective/kl": 47.8714485168457, "objective/non_score_reward": -4.787144660949707, "objective/rlhf_reward": -17.80694394400659, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 2.910266876220703, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.753902792930603, "step": 1360, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9992446899414062 }, { "episode": 21792, "epoch": 0.3917029154833375, "loss/policy_avg": 0.043792642652988434, "lr": 9.13024028629857e-06, "objective/entropy": -348.241943359375, "objective/kl": 27.76773452758789, "objective/non_score_reward": -2.776773452758789, "objective/rlhf_reward": -8.707094287872314, "objective/scores": 0.6, "policy/approxkl_avg": 11.209461212158203, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7479934692382812, "step": 1361, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9988564252853394 }, { "episode": 21808, "epoch": 0.39199050940072616, "loss/policy_avg": 0.7324270606040955, "lr": 9.129601226993867e-06, "objective/entropy": -189.47340393066406, "objective/kl": 34.40290832519531, "objective/non_score_reward": -3.440291166305542, "objective/rlhf_reward": -12.419528654127745, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 5.100346088409424, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8453266620635986, "step": 1362, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 2.0005149841308594 }, { "episode": 21824, "epoch": 0.3922781033181148, "loss/policy_avg": 0.48370519280433655, "lr": 9.128962167689162e-06, "objective/entropy": -232.60858154296875, "objective/kl": 47.771888732910156, "objective/non_score_reward": -4.777188777923584, "objective/rlhf_reward": -17.74950524542181, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 32.09171676635742, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.692764163017273, "step": 1363, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.0015649795532227 }, { "episode": 21840, "epoch": 0.3925656972355035, "loss/policy_avg": 0.531414806842804, "lr": 9.128323108384459e-06, "objective/entropy": -419.53955078125, "objective/kl": 35.909549713134766, "objective/non_score_reward": -3.5909550189971924, "objective/rlhf_reward": -12.939987976749507, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 22.729013442993164, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6363123059272766, "step": 1364, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9991600513458252 }, { "episode": 21856, "epoch": 0.39285329115289214, "loss/policy_avg": 1.075439214706421, "lr": 9.127684049079756e-06, "objective/entropy": -350.808837890625, "objective/kl": 36.461883544921875, "objective/non_score_reward": -3.646188735961914, "objective/rlhf_reward": -12.922894959867584, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 3.9342331886291504, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7204353213310242, "step": 1365, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9990465641021729 }, { "episode": 21872, "epoch": 0.3931408850702808, "loss/policy_avg": 0.4153378903865814, "lr": 9.127044989775053e-06, "objective/entropy": -372.37677001953125, "objective/kl": 29.302295684814453, "objective/non_score_reward": -2.930229425430298, "objective/rlhf_reward": -10.23996556084907, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 38.43246078491211, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6281751394271851, "step": 1366, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 1.998981237411499 }, { "episode": 21888, "epoch": 0.3934284789876694, "loss/policy_avg": 0.7153229713439941, "lr": 9.126405930470348e-06, "objective/entropy": -463.0276794433594, "objective/kl": 17.42813491821289, "objective/non_score_reward": -1.7428135871887207, "objective/rlhf_reward": -5.629618695288329, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 28.79767608642578, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.8106905221939087, "step": 1367, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 14, "val/ratio": 1.9995535612106323 }, { "episode": 21904, "epoch": 0.39371607290505806, "loss/policy_avg": 0.0009032562375068665, "lr": 9.125766871165645e-06, "objective/entropy": -379.85406494140625, "objective/kl": 30.87749481201172, "objective/non_score_reward": -3.087749719619751, "objective/rlhf_reward": -9.427279864193174, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 4.976633548736572, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.821713387966156, "step": 1368, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9988418817520142 }, { "episode": 21920, "epoch": 0.3940036668224467, "loss/policy_avg": 0.07186302542686462, "lr": 9.125127811860942e-06, "objective/entropy": -436.9739074707031, "objective/kl": 29.32456398010254, "objective/non_score_reward": -2.9324564933776855, "objective/rlhf_reward": -9.329825973510742, "objective/scores": 0.6, "policy/approxkl_avg": 19.7957763671875, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.606777548789978, "step": 1369, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9986364841461182 }, { "episode": 21936, "epoch": 0.39429126073983534, "loss/policy_avg": 0.729326605796814, "lr": 9.124488752556238e-06, "objective/entropy": -389.854736328125, "objective/kl": 28.45232391357422, "objective/non_score_reward": -2.8452324867248535, "objective/rlhf_reward": -9.82467123767431, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 0.9159629344940186, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.7564606666564941, "step": 1370, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.0026400089263916 }, { "episode": 21952, "epoch": 0.394578854657224, "loss/policy_avg": 0.4578947126865387, "lr": 9.123849693251534e-06, "objective/entropy": -389.033447265625, "objective/kl": 44.31778335571289, "objective/non_score_reward": -4.431778907775879, "objective/rlhf_reward": -14.803395305515501, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 3.3826184272766113, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6693323850631714, "step": 1371, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9991586208343506 }, { "episode": 21968, "epoch": 0.39486644857461267, "loss/policy_avg": 0.31118065118789673, "lr": 9.12321063394683e-06, "objective/entropy": -322.3407897949219, "objective/kl": 35.505287170410156, "objective/non_score_reward": -3.5505290031433105, "objective/rlhf_reward": -11.278396521450254, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 1.4548633098602295, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6461632251739502, "step": 1372, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.999839425086975 }, { "episode": 21984, "epoch": 0.3951540424920013, "loss/policy_avg": -0.05635763332247734, "lr": 9.122571574642127e-06, "objective/entropy": -410.1376953125, "objective/kl": 35.39653778076172, "objective/non_score_reward": -3.539653778076172, "objective/rlhf_reward": -12.758615112304689, "objective/scores": 0.35, "policy/approxkl_avg": 21.792388916015625, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5122762322425842, "step": 1373, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9988276958465576 }, { "episode": 22000, "epoch": 0.39544163640938995, "loss/policy_avg": 1.872693657875061, "lr": 9.121932515337424e-06, "objective/entropy": -312.9439697265625, "objective/kl": 35.486778259277344, "objective/non_score_reward": -3.548678159713745, "objective/rlhf_reward": -9.79471299648285, "objective/scores": 1.1, "policy/approxkl_avg": 4.229345798492432, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7551467418670654, "step": 1374, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 2.000633478164673 }, { "episode": 22016, "epoch": 0.3957292303267786, "loss/policy_avg": 0.6942653059959412, "lr": 9.121293456032721e-06, "objective/entropy": -109.183837890625, "objective/kl": 44.22985076904297, "objective/non_score_reward": -4.422985076904297, "objective/rlhf_reward": -15.291939115524293, "objective/scores": 0.6, "policy/approxkl_avg": 38.4278450012207, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.39765501022338867, "step": 1375, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.999322533607483 }, { "episode": 22032, "epoch": 0.39601682424416723, "loss/policy_avg": 0.7759445905685425, "lr": 9.120654396728016e-06, "objective/entropy": -368.8749084472656, "objective/kl": 30.625951766967773, "objective/non_score_reward": -3.0625953674316406, "objective/rlhf_reward": -10.588522439420807, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 27.372760772705078, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7324656844139099, "step": 1376, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.998070240020752 }, { "episode": 22048, "epoch": 0.39630441816155587, "loss/policy_avg": 0.6382604837417603, "lr": 9.120015337423313e-06, "objective/entropy": -298.1814880371094, "objective/kl": 48.05633544921875, "objective/non_score_reward": -4.805633544921875, "objective/rlhf_reward": -17.822532987594606, "objective/scores": 0.35, "policy/approxkl_avg": 15.054319381713867, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.749573290348053, "step": 1377, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9986592531204224 }, { "episode": 22064, "epoch": 0.3965920120789445, "loss/policy_avg": 0.053258396685123444, "lr": 9.11937627811861e-06, "objective/entropy": -320.9412841796875, "objective/kl": 31.051342010498047, "objective/non_score_reward": -3.1051342487335205, "objective/rlhf_reward": -11.04193434962402, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 2.6051864624023438, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6188329458236694, "step": 1378, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9987077713012695 }, { "episode": 22080, "epoch": 0.39687960599633315, "loss/policy_avg": -0.022227829322218895, "lr": 9.118737218813907e-06, "objective/entropy": -416.11962890625, "objective/kl": 39.872520446777344, "objective/non_score_reward": -3.9872522354125977, "objective/rlhf_reward": -14.468056323941113, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 1.9590373039245605, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7845929265022278, "step": 1379, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9999374151229858 }, { "episode": 22096, "epoch": 0.39716719991372185, "loss/policy_avg": 2.350285053253174, "lr": 9.118098159509204e-06, "objective/entropy": -348.2054443359375, "objective/kl": 45.75993347167969, "objective/non_score_reward": -4.575993537902832, "objective/rlhf_reward": -16.699853215281088, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 24.84105682373047, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5819368362426758, "step": 1380, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9977903366088867 }, { "episode": 22112, "epoch": 0.3974547938311105, "loss/policy_avg": 0.33440935611724854, "lr": 9.1174591002045e-06, "objective/entropy": -365.2570495605469, "objective/kl": 29.346817016601562, "objective/non_score_reward": -2.9346818923950195, "objective/rlhf_reward": -10.379476988051815, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 32.46656799316406, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7213993072509766, "step": 1381, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9971586465835571 }, { "episode": 22128, "epoch": 0.3977423877484991, "loss/policy_avg": 0.9947599172592163, "lr": 9.116820040899796e-06, "objective/entropy": -173.59478759765625, "objective/kl": 40.165748596191406, "objective/non_score_reward": -4.016574859619141, "objective/rlhf_reward": -14.61570105990921, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 28.563400268554688, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6623480319976807, "step": 1382, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9988077878952026 }, { "episode": 22144, "epoch": 0.39802998166588777, "loss/policy_avg": -0.45232996344566345, "lr": 9.116180981595093e-06, "objective/entropy": -375.18206787109375, "objective/kl": 39.70623016357422, "objective/non_score_reward": -3.970623254776001, "objective/rlhf_reward": -12.958774481655333, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 4.407848358154297, "policy/clipfrac_avg": 2.0, "policy/entropy_avg": 0.6866327524185181, "step": 1383, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 2.002528667449951 }, { "episode": 22160, "epoch": 0.3983175755832764, "loss/policy_avg": 0.7820472717285156, "lr": 9.11554192229039e-06, "objective/entropy": -348.3802490234375, "objective/kl": 32.83573913574219, "objective/non_score_reward": -3.283573627471924, "objective/rlhf_reward": -11.755692579833369, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 3.2820870876312256, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6438629627227783, "step": 1384, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9997235536575317 }, { "episode": 22176, "epoch": 0.39860516950066505, "loss/policy_avg": 0.6776278018951416, "lr": 9.114902862985686e-06, "objective/entropy": -288.54461669921875, "objective/kl": 46.94630432128906, "objective/non_score_reward": -4.694630146026611, "objective/rlhf_reward": -16.831110308842597, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 4.1883745193481445, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6386712789535522, "step": 1385, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0009355545043945 }, { "episode": 22192, "epoch": 0.3988927634180537, "loss/policy_avg": 1.8577520847320557, "lr": 9.114263803680983e-06, "objective/entropy": -424.4234619140625, "objective/kl": 29.45998764038086, "objective/non_score_reward": -2.9459989070892334, "objective/rlhf_reward": -10.303043010647654, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 3.4935410022735596, "policy/clipfrac_avg": 0.25, "policy/entropy_avg": 0.7289859056472778, "step": 1386, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.998971700668335 }, { "episode": 22208, "epoch": 0.3991803573354424, "loss/policy_avg": 0.41026556491851807, "lr": 9.113624744376279e-06, "objective/entropy": -261.74493408203125, "objective/kl": 26.860240936279297, "objective/non_score_reward": -2.6860241889953613, "objective/rlhf_reward": -9.010763303438821, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 35.64683532714844, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7220107316970825, "step": 1387, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9977118968963623 }, { "episode": 22224, "epoch": 0.399467951252831, "loss/policy_avg": 0.9641202092170715, "lr": 9.112985685071575e-06, "objective/entropy": -86.7524185180664, "objective/kl": 35.74491882324219, "objective/non_score_reward": -3.5744919776916504, "objective/rlhf_reward": -11.897967672348024, "objective/scores": 0.6, "policy/approxkl_avg": 2.978646755218506, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.739168643951416, "step": 1388, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9987390041351318 }, { "episode": 22240, "epoch": 0.39975554517021966, "loss/policy_avg": 0.9050331711769104, "lr": 9.112346625766872e-06, "objective/entropy": -410.64483642578125, "objective/kl": 23.339214324951172, "objective/non_score_reward": -2.333921432495117, "objective/rlhf_reward": -6.935685133934021, "objective/scores": 0.6, "policy/approxkl_avg": 3.7293381690979004, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6676751375198364, "step": 1389, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 2.000518321990967 }, { "episode": 22256, "epoch": 0.4000431390876083, "loss/policy_avg": 0.4625541865825653, "lr": 9.111707566462168e-06, "objective/entropy": -384.5097961425781, "objective/kl": 26.76449203491211, "objective/non_score_reward": -2.6764490604400635, "objective/rlhf_reward": -8.758385489659245, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 23.613750457763672, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.679041862487793, "step": 1390, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 14, "val/ratio": 1.9988677501678467 }, { "episode": 22272, "epoch": 0.40033073300499694, "loss/policy_avg": -0.01955397054553032, "lr": 9.111068507157464e-06, "objective/entropy": -402.06884765625, "objective/kl": 20.775142669677734, "objective/non_score_reward": -2.0775139331817627, "objective/rlhf_reward": -6.950806343291683, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 2.319344997406006, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.8323314189910889, "step": 1391, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.999647855758667 }, { "episode": 22288, "epoch": 0.4006183269223856, "loss/policy_avg": -0.31736859679222107, "lr": 9.110429447852761e-06, "objective/entropy": -268.57232666015625, "objective/kl": 35.923954010009766, "objective/non_score_reward": -3.592395544052124, "objective/rlhf_reward": -12.945750434597102, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 9.920536994934082, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.6324843168258667, "step": 1392, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0008721351623535 }, { "episode": 22304, "epoch": 0.4009059208397742, "loss/policy_avg": -0.45283621549606323, "lr": 9.109790388548058e-06, "objective/entropy": -330.77630615234375, "objective/kl": 33.07595443725586, "objective/non_score_reward": -3.3075952529907227, "objective/rlhf_reward": -11.626261506144125, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 50.805206298828125, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6314573287963867, "step": 1393, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.007061004638672 }, { "episode": 22320, "epoch": 0.40119351475716286, "loss/policy_avg": -0.7062708139419556, "lr": 9.109151329243355e-06, "objective/entropy": -332.57171630859375, "objective/kl": 39.94821548461914, "objective/non_score_reward": -3.994821786880493, "objective/rlhf_reward": -14.653774056464357, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 24.905010223388672, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6414631605148315, "step": 1394, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 2.000903606414795 }, { "episode": 22336, "epoch": 0.40148110867455156, "loss/policy_avg": 0.6870003938674927, "lr": 9.10851226993865e-06, "objective/entropy": -303.47955322265625, "objective/kl": 37.442359924316406, "objective/non_score_reward": -3.7442355155944824, "objective/rlhf_reward": -13.526344637484893, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 3.711573362350464, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4501544237136841, "step": 1395, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9997477531433105 }, { "episode": 22352, "epoch": 0.4017687025919402, "loss/policy_avg": 0.9254546165466309, "lr": 9.107873210633947e-06, "objective/entropy": -405.86993408203125, "objective/kl": 37.7830810546875, "objective/non_score_reward": -3.7783079147338867, "objective/rlhf_reward": -13.509111437861044, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 38.80181884765625, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7633426189422607, "step": 1396, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 14, "val/ratio": 1.9987798929214478 }, { "episode": 22368, "epoch": 0.40205629650932884, "loss/policy_avg": 0.5997653603553772, "lr": 9.107234151329244e-06, "objective/entropy": -254.2314453125, "objective/kl": 39.66624450683594, "objective/non_score_reward": -3.9666242599487305, "objective/rlhf_reward": -14.540984544783754, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 39.93000411987305, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.9109807014465332, "step": 1397, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 14, "val/ratio": 1.9978280067443848 }, { "episode": 22384, "epoch": 0.4023438904267175, "loss/policy_avg": 0.11461706459522247, "lr": 9.10659509202454e-06, "objective/entropy": -247.71533203125, "objective/kl": 39.064109802246094, "objective/non_score_reward": -3.9064109325408936, "objective/rlhf_reward": -14.201811630924311, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 12.913026809692383, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.7085828185081482, "step": 1398, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 15, "val/ratio": 1.9986286163330078 }, { "episode": 22400, "epoch": 0.4026314843441061, "loss/policy_avg": 0.14109595119953156, "lr": 9.105956032719838e-06, "objective/entropy": -424.5947570800781, "objective/kl": 26.497920989990234, "objective/non_score_reward": -2.649791955947876, "objective/rlhf_reward": -9.14856980285202, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 17.29753875732422, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6786292791366577, "step": 1399, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9983388185501099 }, { "episode": 22416, "epoch": 0.40291907826149476, "loss/policy_avg": 0.3856104910373688, "lr": 9.105316973415133e-06, "objective/entropy": -395.4479675292969, "objective/kl": 34.77428436279297, "objective/non_score_reward": -3.477428436279297, "objective/rlhf_reward": -12.247854714811432, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 20.940475463867188, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8851934671401978, "step": 1400, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 15, "val/ratio": 2.000429630279541 }, { "episode": 22432, "epoch": 0.4032066721788834, "loss/policy_avg": 0.4526877701282501, "lr": 9.10467791411043e-06, "objective/entropy": -421.24737548828125, "objective/kl": 42.90397644042969, "objective/non_score_reward": -4.290397644042969, "objective/rlhf_reward": -15.21417910881513, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 1.1188342571258545, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7759689688682556, "step": 1401, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.998932123184204 }, { "episode": 22448, "epoch": 0.40349426609627204, "loss/policy_avg": 0.30102306604385376, "lr": 9.104038854805727e-06, "objective/entropy": -385.48809814453125, "objective/kl": 31.550262451171875, "objective/non_score_reward": -3.155026435852051, "objective/rlhf_reward": -11.220106101036071, "objective/scores": 0.35, "policy/approxkl_avg": 1.806783676147461, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6684150695800781, "step": 1402, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9999845027923584 }, { "episode": 22464, "epoch": 0.40378186001366073, "loss/policy_avg": 0.9576441049575806, "lr": 9.103399795501024e-06, "objective/entropy": -280.8646240234375, "objective/kl": 36.314903259277344, "objective/non_score_reward": -3.631490468978882, "objective/rlhf_reward": -13.102129776676264, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 16.426101684570312, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5873246192932129, "step": 1403, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 1.9986693859100342 }, { "episode": 22480, "epoch": 0.40406945393104937, "loss/policy_avg": 0.35756155848503113, "lr": 9.10276073619632e-06, "objective/entropy": -345.475341796875, "objective/kl": 46.21757507324219, "objective/non_score_reward": -4.621757507324219, "objective/rlhf_reward": -16.82517099899112, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 22.888484954833984, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5141409635543823, "step": 1404, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.000466823577881 }, { "episode": 22496, "epoch": 0.404357047848438, "loss/policy_avg": -0.23042599856853485, "lr": 9.102121676891617e-06, "objective/entropy": -377.4267272949219, "objective/kl": 36.089569091796875, "objective/non_score_reward": -3.608957052230835, "objective/rlhf_reward": -12.61099957970054, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 51.365867614746094, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7267637252807617, "step": 1405, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 2.001061201095581 }, { "episode": 22512, "epoch": 0.40464464176582665, "loss/policy_avg": 1.245417833328247, "lr": 9.101482617586912e-06, "objective/entropy": -353.95501708984375, "objective/kl": 38.99306106567383, "objective/non_score_reward": -3.899306058883667, "objective/rlhf_reward": -14.255588343649535, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 1.7391271591186523, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6493725776672363, "step": 1406, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 14, "val/ratio": 2.0004913806915283 }, { "episode": 22528, "epoch": 0.4049322356832153, "loss/policy_avg": 0.8556283116340637, "lr": 9.10084355828221e-06, "objective/entropy": -423.969970703125, "objective/kl": 28.88025665283203, "objective/non_score_reward": -2.8880255222320557, "objective/rlhf_reward": -10.192852461074274, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 1.9616515636444092, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5566847324371338, "step": 1407, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.0026328563690186 }, { "episode": 22544, "epoch": 0.40521982960060393, "loss/policy_avg": 0.43810462951660156, "lr": 9.100204498977506e-06, "objective/entropy": -412.5846862792969, "objective/kl": 39.4505615234375, "objective/non_score_reward": -3.9450559616088867, "objective/rlhf_reward": -14.356391270359126, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 189.5712890625, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.705894947052002, "step": 1408, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 15, "val/ratio": 1.9994113445281982 }, { "episode": 22560, "epoch": 0.40550742351799257, "loss/policy_avg": 0.06738288700580597, "lr": 9.099565439672803e-06, "objective/entropy": -339.53570556640625, "objective/kl": 33.023292541503906, "objective/non_score_reward": -3.3023290634155273, "objective/rlhf_reward": -11.883803639441652, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 4.846456527709961, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.8043756484985352, "step": 1409, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 15, "val/ratio": 2.004558563232422 }, { "episode": 22576, "epoch": 0.40579501743538127, "loss/policy_avg": -0.77619469165802, "lr": 9.0989263803681e-06, "objective/entropy": -118.6910629272461, "objective/kl": 39.51353454589844, "objective/non_score_reward": -3.9513535499572754, "objective/rlhf_reward": -13.980584974559854, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 2.3107662200927734, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.7634246349334717, "step": 1410, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0008883476257324 }, { "episode": 22592, "epoch": 0.4060826113527699, "loss/policy_avg": 0.45422253012657166, "lr": 9.098287321063395e-06, "objective/entropy": -354.6509094238281, "objective/kl": 31.199861526489258, "objective/non_score_reward": -3.1199862957000732, "objective/rlhf_reward": -10.532533953862128, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 2.734361171722412, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6763803362846375, "step": 1411, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.999189853668213 }, { "episode": 22608, "epoch": 0.40637020527015855, "loss/policy_avg": -0.1583622395992279, "lr": 9.097648261758692e-06, "objective/entropy": -342.59625244140625, "objective/kl": 43.826751708984375, "objective/non_score_reward": -4.3826751708984375, "objective/rlhf_reward": -16.014927947314913, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 8.397621154785156, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7912417650222778, "step": 1412, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.9984478950500488 }, { "episode": 22624, "epoch": 0.4066577991875472, "loss/policy_avg": 1.6019296646118164, "lr": 9.097009202453987e-06, "objective/entropy": -393.14013671875, "objective/kl": 34.74468994140625, "objective/non_score_reward": -3.474468946456909, "objective/rlhf_reward": -12.341616242137505, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 25.328798294067383, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5830036401748657, "step": 1413, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.999085545539856 }, { "episode": 22640, "epoch": 0.4069453931049358, "loss/policy_avg": 0.45232057571411133, "lr": 9.096370143149284e-06, "objective/entropy": -211.57362365722656, "objective/kl": 38.22944641113281, "objective/non_score_reward": -3.8229446411132812, "objective/rlhf_reward": -13.91317615756164, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 4.637635231018066, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8866195678710938, "step": 1414, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.999864935874939 }, { "episode": 22656, "epoch": 0.40723298702232447, "loss/policy_avg": 0.1311303824186325, "lr": 9.095731083844581e-06, "objective/entropy": -432.05316162109375, "objective/kl": 31.396133422851562, "objective/non_score_reward": -3.139613389968872, "objective/rlhf_reward": -11.158453321456909, "objective/scores": 0.35, "policy/approxkl_avg": 1.970788598060608, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.879388153553009, "step": 1415, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 15, "val/ratio": 1.9987220764160156 }, { "episode": 22672, "epoch": 0.4075205809397131, "loss/policy_avg": 0.007099000737071037, "lr": 9.095092024539878e-06, "objective/entropy": -424.5282287597656, "objective/kl": 27.932052612304688, "objective/non_score_reward": -2.7932052612304688, "objective/rlhf_reward": -9.510961180151092, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 1.861342191696167, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5432627201080322, "step": 1416, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9990956783294678 }, { "episode": 22688, "epoch": 0.40780817485710175, "loss/policy_avg": -0.05355055630207062, "lr": 9.094452965235175e-06, "objective/entropy": -374.78106689453125, "objective/kl": 35.658180236816406, "objective/non_score_reward": -3.5658175945281982, "objective/rlhf_reward": -12.921634724646239, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 1.2089463472366333, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5137399435043335, "step": 1417, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9998743534088135 }, { "episode": 22704, "epoch": 0.40809576877449044, "loss/policy_avg": 0.5128463506698608, "lr": 9.093813905930472e-06, "objective/entropy": -356.82537841796875, "objective/kl": 34.32390213012695, "objective/non_score_reward": -3.4323902130126953, "objective/rlhf_reward": -11.90473258045585, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 18.846790313720703, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8689748048782349, "step": 1418, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 2.000319480895996 }, { "episode": 22720, "epoch": 0.4083833626918791, "loss/policy_avg": 0.5076851844787598, "lr": 9.093174846625767e-06, "objective/entropy": -388.16070556640625, "objective/kl": 35.62187957763672, "objective/non_score_reward": -3.5621883869171143, "objective/rlhf_reward": -12.692494003978325, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 3.3985419273376465, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7391316890716553, "step": 1419, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.9988856315612793 }, { "episode": 22736, "epoch": 0.4086709566092677, "loss/policy_avg": -0.314700186252594, "lr": 9.092535787321064e-06, "objective/entropy": -387.8660583496094, "objective/kl": 35.4585075378418, "objective/non_score_reward": -3.5458507537841797, "objective/rlhf_reward": -12.759570915897456, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 2.5373263359069824, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.5587910413742065, "step": 1420, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.001682996749878 }, { "episode": 22752, "epoch": 0.40895855052665636, "loss/policy_avg": 0.7741593718528748, "lr": 9.09189672801636e-06, "objective/entropy": -357.4297180175781, "objective/kl": 32.32339859008789, "objective/non_score_reward": -3.232340097427368, "objective/rlhf_reward": -11.603847060233278, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 0.8878560066223145, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7675343751907349, "step": 1421, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 2.0009825229644775 }, { "episode": 22768, "epoch": 0.409246144444045, "loss/policy_avg": 1.130360722541809, "lr": 9.091257668711657e-06, "objective/entropy": -378.8727722167969, "objective/kl": 42.23790740966797, "objective/non_score_reward": -4.223791122436523, "objective/rlhf_reward": -15.233304505766021, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 5.394435882568359, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6726235151290894, "step": 1422, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9988950490951538 }, { "episode": 22784, "epoch": 0.40953373836143364, "loss/policy_avg": 0.8491315245628357, "lr": 9.090618609406954e-06, "objective/entropy": -314.90924072265625, "objective/kl": 24.481353759765625, "objective/non_score_reward": -2.4481353759765625, "objective/rlhf_reward": -7.845130632595953, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 23.284061431884766, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.635724663734436, "step": 1423, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9979939460754395 }, { "episode": 22800, "epoch": 0.4098213322788223, "loss/policy_avg": 0.33193281292915344, "lr": 9.08997955010225e-06, "objective/entropy": -303.7691650390625, "objective/kl": 35.59843063354492, "objective/non_score_reward": -3.559843063354492, "objective/rlhf_reward": -12.897736480742125, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 9.411911010742188, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4979242980480194, "step": 1424, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 1.9995088577270508 }, { "episode": 22816, "epoch": 0.410108926196211, "loss/policy_avg": 0.20840957760810852, "lr": 9.089340490797546e-06, "objective/entropy": -300.86102294921875, "objective/kl": 41.57798385620117, "objective/non_score_reward": -4.1577982902526855, "objective/rlhf_reward": -12.231193280220033, "objective/scores": 1.1, "policy/approxkl_avg": 190.43075561523438, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5551633238792419, "step": 1425, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.9975132942199707 }, { "episode": 22832, "epoch": 0.4103965201135996, "loss/policy_avg": 0.3468247056007385, "lr": 9.088701431492843e-06, "objective/entropy": -340.70361328125, "objective/kl": 37.193603515625, "objective/non_score_reward": -3.719360113143921, "objective/rlhf_reward": -12.754733862654241, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 26.998334884643555, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6371942758560181, "step": 1426, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9993436336517334 }, { "episode": 22848, "epoch": 0.41068411403098826, "loss/policy_avg": -0.15378472208976746, "lr": 9.08806237218814e-06, "objective/entropy": -331.3084716796875, "objective/kl": 26.664321899414062, "objective/non_score_reward": -2.6664319038391113, "objective/rlhf_reward": -9.287126162139279, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 2.8676066398620605, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7880427837371826, "step": 1427, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 2.0001583099365234 }, { "episode": 22864, "epoch": 0.4109717079483769, "loss/policy_avg": 1.0802255868911743, "lr": 9.087423312883437e-06, "objective/entropy": -389.2430725097656, "objective/kl": 33.73304748535156, "objective/non_score_reward": -3.373304843902588, "objective/rlhf_reward": -12.167705807715578, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 17.666973114013672, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.7094606161117554, "step": 1428, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.99909508228302 }, { "episode": 22880, "epoch": 0.41125930186576554, "loss/policy_avg": 0.6939965486526489, "lr": 9.086784253578734e-06, "objective/entropy": -375.1764221191406, "objective/kl": 42.63515853881836, "objective/non_score_reward": -4.263516426086426, "objective/rlhf_reward": -15.603466848941192, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 20.788314819335938, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6509706974029541, "step": 1429, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9973926544189453 }, { "episode": 22896, "epoch": 0.4115468957831542, "loss/policy_avg": 2.086167573928833, "lr": 9.086145194274029e-06, "objective/entropy": -422.97723388671875, "objective/kl": 37.87635803222656, "objective/non_score_reward": -3.787635564804077, "objective/rlhf_reward": -13.325713510784219, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 148.28750610351562, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.5761317014694214, "step": 1430, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9992225170135498 }, { "episode": 22912, "epoch": 0.4118344897005428, "loss/policy_avg": 0.33449769020080566, "lr": 9.085506134969326e-06, "objective/entropy": -386.03173828125, "objective/kl": 36.017215728759766, "objective/non_score_reward": -3.6017215251922607, "objective/rlhf_reward": -12.745026712835418, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 14.944709777832031, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6528323888778687, "step": 1431, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9998407363891602 }, { "episode": 22928, "epoch": 0.41212208361793146, "loss/policy_avg": 1.7644410133361816, "lr": 9.084867075664623e-06, "objective/entropy": -384.57171630859375, "objective/kl": 33.17778015136719, "objective/non_score_reward": -3.3177783489227295, "objective/rlhf_reward": -11.847281058033076, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 3.8824071884155273, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.557998538017273, "step": 1432, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.9988131523132324 }, { "episode": 22944, "epoch": 0.41240967753532015, "loss/policy_avg": 0.8893593549728394, "lr": 9.08422801635992e-06, "objective/entropy": -238.5972900390625, "objective/kl": 46.162994384765625, "objective/non_score_reward": -4.616299629211426, "objective/rlhf_reward": -16.984246614392163, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 73.93292236328125, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6370261907577515, "step": 1433, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.997525691986084 }, { "episode": 22960, "epoch": 0.4126972714527088, "loss/policy_avg": 0.9063405990600586, "lr": 9.083588957055215e-06, "objective/entropy": -367.1580505371094, "objective/kl": 47.45819091796875, "objective/non_score_reward": -4.745819091796875, "objective/rlhf_reward": -16.860570850149664, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 8.696590423583984, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.744605541229248, "step": 1434, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 1.9968764781951904 }, { "episode": 22976, "epoch": 0.41298486537009743, "loss/policy_avg": 0.7586631774902344, "lr": 9.082949897750512e-06, "objective/entropy": -263.5920715332031, "objective/kl": 39.14834213256836, "objective/non_score_reward": -3.9148342609405518, "objective/rlhf_reward": -14.317701390295653, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 15.43975830078125, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7210919857025146, "step": 1435, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9968843460083008 }, { "episode": 22992, "epoch": 0.41327245928748607, "loss/policy_avg": 0.2233695387840271, "lr": 9.082310838445809e-06, "objective/entropy": -331.63323974609375, "objective/kl": 34.002052307128906, "objective/non_score_reward": -3.40020489692688, "objective/rlhf_reward": -12.275307211905641, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 10.674327850341797, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7023972272872925, "step": 1436, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 2.0017385482788086 }, { "episode": 23008, "epoch": 0.4135600532048747, "loss/policy_avg": -0.37469443678855896, "lr": 9.081671779141104e-06, "objective/entropy": -431.8475341796875, "objective/kl": 34.23546600341797, "objective/non_score_reward": -3.423546552658081, "objective/rlhf_reward": -12.032326465070831, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 70.57575988769531, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5411181449890137, "step": 1437, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.0012850761413574 }, { "episode": 23024, "epoch": 0.41384764712226335, "loss/policy_avg": 1.1029399633407593, "lr": 9.0810327198364e-06, "objective/entropy": -308.580810546875, "objective/kl": 42.716148376464844, "objective/non_score_reward": -4.271615028381348, "objective/rlhf_reward": -15.7272109625086, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 40.359375, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5510718822479248, "step": 1438, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9983727931976318 }, { "episode": 23040, "epoch": 0.414135241039652, "loss/policy_avg": 0.8181493878364563, "lr": 9.080393660531698e-06, "objective/entropy": -453.20599365234375, "objective/kl": 33.934532165527344, "objective/non_score_reward": -3.393453359603882, "objective/rlhf_reward": -12.173813438415527, "objective/scores": 0.35, "policy/approxkl_avg": 83.23323059082031, "policy/clipfrac_avg": 0.25, "policy/entropy_avg": 0.5865987539291382, "step": 1439, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9991786479949951 }, { "episode": 23056, "epoch": 0.41442283495704063, "loss/policy_avg": -0.09513190388679504, "lr": 9.079754601226994e-06, "objective/entropy": -420.421875, "objective/kl": 33.76405334472656, "objective/non_score_reward": -3.376405715942383, "objective/rlhf_reward": -11.772289053599039, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 2.2576780319213867, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6587756872177124, "step": 1440, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 1.9985597133636475 }, { "episode": 23072, "epoch": 0.4147104288744293, "loss/policy_avg": 0.3033369183540344, "lr": 9.079115541922291e-06, "objective/entropy": -357.442626953125, "objective/kl": 34.32968521118164, "objective/non_score_reward": -3.4329686164855957, "objective/rlhf_reward": -12.28127584895645, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 4.817025184631348, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6352260708808899, "step": 1441, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.999590516090393 }, { "episode": 23088, "epoch": 0.41499802279181797, "loss/policy_avg": 0.5493979454040527, "lr": 9.078476482617588e-06, "objective/entropy": -384.9279479980469, "objective/kl": 26.127473831176758, "objective/non_score_reward": -2.6127474308013916, "objective/rlhf_reward": -8.894730417934015, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 17.887895584106445, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7002627849578857, "step": 1442, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9993629455566406 }, { "episode": 23104, "epoch": 0.4152856167092066, "loss/policy_avg": -0.10275371372699738, "lr": 9.077837423312883e-06, "objective/entropy": -382.1352844238281, "objective/kl": 25.628097534179688, "objective/non_score_reward": -2.562809705734253, "objective/rlhf_reward": -8.589379792631256, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 0.48566174507141113, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5493674874305725, "step": 1443, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 14, "val/ratio": 2.0012879371643066 }, { "episode": 23120, "epoch": 0.41557321062659525, "loss/policy_avg": 0.09670546650886536, "lr": 9.07719836400818e-06, "objective/entropy": -415.32354736328125, "objective/kl": 41.334068298339844, "objective/non_score_reward": -4.133406639099121, "objective/rlhf_reward": -15.052675130780102, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 48.56070327758789, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4772990047931671, "step": 1444, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9979596138000488 }, { "episode": 23136, "epoch": 0.4158608045439839, "loss/policy_avg": 0.034666746854782104, "lr": 9.076559304703477e-06, "objective/entropy": -334.47491455078125, "objective/kl": 44.47458267211914, "objective/non_score_reward": -4.447458267211914, "objective/rlhf_reward": -16.43058296415655, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 10.074474334716797, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5482075214385986, "step": 1445, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.998344898223877 }, { "episode": 23152, "epoch": 0.4161483984613725, "loss/policy_avg": 1.0825122594833374, "lr": 9.075920245398774e-06, "objective/entropy": -345.6462707519531, "objective/kl": 35.18891143798828, "objective/non_score_reward": -3.518890857696533, "objective/rlhf_reward": -12.651731569965449, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 120.994384765625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6722222566604614, "step": 1446, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.998002529144287 }, { "episode": 23168, "epoch": 0.41643599237876117, "loss/policy_avg": 0.706182062625885, "lr": 9.075281186094071e-06, "objective/entropy": -363.20367431640625, "objective/kl": 50.139644622802734, "objective/non_score_reward": -5.013964653015137, "objective/rlhf_reward": -17.132138405681822, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 29.070152282714844, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6738229990005493, "step": 1447, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9997481107711792 }, { "episode": 23184, "epoch": 0.41672358629614986, "loss/policy_avg": 0.2096470445394516, "lr": 9.074642126789366e-06, "objective/entropy": -355.6063232421875, "objective/kl": 33.5555534362793, "objective/non_score_reward": -3.355555295944214, "objective/rlhf_reward": -9.022221183776855, "objective/scores": 1.1, "policy/approxkl_avg": 66.57122802734375, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.641074538230896, "step": 1448, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9990050792694092 }, { "episode": 23200, "epoch": 0.4170111802135385, "loss/policy_avg": 1.146085500717163, "lr": 9.074003067484663e-06, "objective/entropy": -334.9264221191406, "objective/kl": 43.11702346801758, "objective/non_score_reward": -4.311702251434326, "objective/rlhf_reward": -15.765857341702343, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 2.130558490753174, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.795293927192688, "step": 1449, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 2.000967502593994 }, { "episode": 23216, "epoch": 0.41729877413092714, "loss/policy_avg": 0.20703676342964172, "lr": 9.07336400817996e-06, "objective/entropy": -350.3949279785156, "objective/kl": 31.951393127441406, "objective/non_score_reward": -3.1951394081115723, "objective/rlhf_reward": -11.401954987136225, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 9.006378173828125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5538038015365601, "step": 1450, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9980530738830566 }, { "episode": 23232, "epoch": 0.4175863680483158, "loss/policy_avg": 0.6031292676925659, "lr": 9.072724948875257e-06, "objective/entropy": -380.242431640625, "objective/kl": 35.23046875, "objective/non_score_reward": -3.5230472087860107, "objective/rlhf_reward": -11.168469820858213, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 12.640142440795898, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.8075990676879883, "step": 1451, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9993387460708618 }, { "episode": 23248, "epoch": 0.4178739619657044, "loss/policy_avg": 0.8722798824310303, "lr": 9.072085889570554e-06, "objective/entropy": -425.9102783203125, "objective/kl": 30.834789276123047, "objective/non_score_reward": -3.0834789276123047, "objective/rlhf_reward": -9.410197292209837, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 14.900014877319336, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6570580005645752, "step": 1452, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.99855637550354 }, { "episode": 23264, "epoch": 0.41816155588309306, "loss/policy_avg": 0.2141641080379486, "lr": 9.07144683026585e-06, "objective/entropy": -371.13934326171875, "objective/kl": 45.01332092285156, "objective/non_score_reward": -4.5013322830200195, "objective/rlhf_reward": -16.449069588389946, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 5.156445503234863, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7075440883636475, "step": 1453, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 1.9979251623153687 }, { "episode": 23280, "epoch": 0.4184491498004817, "loss/policy_avg": 0.5542109608650208, "lr": 9.070807770961146e-06, "objective/entropy": -419.9239807128906, "objective/kl": 35.7601318359375, "objective/non_score_reward": -3.5760130882263184, "objective/rlhf_reward": -14.304052591323853, "objective/scores": 0.0, "policy/approxkl_avg": 1.910635232925415, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7288752198219299, "step": 1454, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 2.001138925552368 }, { "episode": 23296, "epoch": 0.41873674371787034, "loss/policy_avg": -0.3262504041194916, "lr": 9.070168711656443e-06, "objective/entropy": -350.53662109375, "objective/kl": 27.045644760131836, "objective/non_score_reward": -2.7045645713806152, "objective/rlhf_reward": -9.439655401793818, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 28.817310333251953, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6510840654373169, "step": 1455, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.0024008750915527 }, { "episode": 23312, "epoch": 0.41902433763525904, "loss/policy_avg": 0.07129822671413422, "lr": 9.069529652351738e-06, "objective/entropy": -406.55377197265625, "objective/kl": 41.80459976196289, "objective/non_score_reward": -4.180459976196289, "objective/rlhf_reward": -15.240887763913037, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 38.414485931396484, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6523741483688354, "step": 1456, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.997438907623291 }, { "episode": 23328, "epoch": 0.4193119315526477, "loss/policy_avg": 0.302828311920166, "lr": 9.068890593047035e-06, "objective/entropy": -354.5792541503906, "objective/kl": 39.814353942871094, "objective/non_score_reward": -3.9814352989196777, "objective/rlhf_reward": -14.600228581458254, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 9.800355911254883, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6048238277435303, "step": 1457, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.99851655960083 }, { "episode": 23344, "epoch": 0.4195995254700363, "loss/policy_avg": 0.35747191309928894, "lr": 9.068251533742332e-06, "objective/entropy": -375.5713195800781, "objective/kl": 27.999340057373047, "objective/non_score_reward": -2.799934148788452, "objective/rlhf_reward": -9.595616612497883, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 1.7535874843597412, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.6133080124855042, "step": 1458, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9992082118988037 }, { "episode": 23360, "epoch": 0.41988711938742496, "loss/policy_avg": 0.25009599328041077, "lr": 9.067612474437628e-06, "objective/entropy": -417.4339599609375, "objective/kl": 36.056114196777344, "objective/non_score_reward": -3.605611562728882, "objective/rlhf_reward": -12.998614151676264, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 39.8892822265625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6061559915542603, "step": 1459, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 1.9984492063522339 }, { "episode": 23376, "epoch": 0.4201747133048136, "loss/policy_avg": 0.8928461670875549, "lr": 9.066973415132925e-06, "objective/entropy": -345.05902099609375, "objective/kl": 48.816688537597656, "objective/non_score_reward": -4.881669044494629, "objective/rlhf_reward": -18.167426311705988, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 7.162357330322266, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7079042196273804, "step": 1460, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9982240200042725 }, { "episode": 23392, "epoch": 0.42046230722220224, "loss/policy_avg": -0.6252841353416443, "lr": 9.06633435582822e-06, "objective/entropy": -393.3060607910156, "objective/kl": 33.32200241088867, "objective/non_score_reward": -3.332200050354004, "objective/rlhf_reward": -11.969549530480784, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 3.962728261947632, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6804478168487549, "step": 1461, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 2.0001559257507324 }, { "episode": 23408, "epoch": 0.4207499011395909, "loss/policy_avg": 0.1314074993133545, "lr": 9.065695296523517e-06, "objective/entropy": -367.0579833984375, "objective/kl": 35.84513854980469, "objective/non_score_reward": -3.5845136642456055, "objective/rlhf_reward": -12.857101324017407, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 6.829492568969727, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8056256771087646, "step": 1462, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 21, "val/ratio": 1.9982259273529053 }, { "episode": 23424, "epoch": 0.42103749505697957, "loss/policy_avg": 0.2585163712501526, "lr": 9.065056237218814e-06, "objective/entropy": -321.7686767578125, "objective/kl": 36.79988479614258, "objective/non_score_reward": -3.679988384246826, "objective/rlhf_reward": -10.319953417778017, "objective/scores": 1.1, "policy/approxkl_avg": 1.9164154529571533, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5824712514877319, "step": 1463, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.999955177307129 }, { "episode": 23440, "epoch": 0.4213250889743682, "loss/policy_avg": 0.2393825799226761, "lr": 9.064417177914111e-06, "objective/entropy": -334.32342529296875, "objective/kl": 30.760910034179688, "objective/non_score_reward": -3.0760912895202637, "objective/rlhf_reward": -10.788592898639378, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 1.8994873762130737, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6071294546127319, "step": 1464, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 1.9998973608016968 }, { "episode": 23456, "epoch": 0.42161268289175685, "loss/policy_avg": -0.0033771321177482605, "lr": 9.063778118609408e-06, "objective/entropy": -260.1138610839844, "objective/kl": 39.488792419433594, "objective/non_score_reward": -3.9488797187805176, "objective/rlhf_reward": -14.436268532012384, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 17.981369018554688, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5986069440841675, "step": 1465, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.002298355102539 }, { "episode": 23472, "epoch": 0.4219002768091455, "loss/policy_avg": -0.1478525847196579, "lr": 9.063139059304705e-06, "objective/entropy": -437.809814453125, "objective/kl": 31.00763702392578, "objective/non_score_reward": -3.1007633209228516, "objective/rlhf_reward": -10.455642770008977, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 22.707975387573242, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.735325813293457, "step": 1466, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 2.003391981124878 }, { "episode": 23488, "epoch": 0.42218787072653413, "loss/policy_avg": -0.23670843243598938, "lr": 9.0625e-06, "objective/entropy": -349.90948486328125, "objective/kl": 40.505889892578125, "objective/non_score_reward": -4.050589561462402, "objective/rlhf_reward": -11.802356815338136, "objective/scores": 1.1, "policy/approxkl_avg": 29.180374145507812, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.702825665473938, "step": 1467, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 2.000678062438965 }, { "episode": 23504, "epoch": 0.42247546464392277, "loss/policy_avg": -0.028040185570716858, "lr": 9.061860940695297e-06, "objective/entropy": -384.9526062011719, "objective/kl": 26.610754013061523, "objective/non_score_reward": -2.6610753536224365, "objective/rlhf_reward": -9.244301891326904, "objective/scores": 0.35, "policy/approxkl_avg": 2.249297618865967, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6888667345046997, "step": 1468, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.0000014305114746 }, { "episode": 23520, "epoch": 0.4227630585613114, "loss/policy_avg": 0.7756717801094055, "lr": 9.061221881390594e-06, "objective/entropy": -415.220458984375, "objective/kl": 32.89244842529297, "objective/non_score_reward": -3.2892446517944336, "objective/rlhf_reward": -11.55285898214968, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 0.9863064289093018, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6666020154953003, "step": 1469, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.0010454654693604 }, { "episode": 23536, "epoch": 0.42305065247870005, "loss/policy_avg": 0.3698832392692566, "lr": 9.06058282208589e-06, "objective/entropy": -306.9219970703125, "objective/kl": 30.57228660583496, "objective/non_score_reward": -3.0572288036346436, "objective/rlhf_reward": -10.404086227687905, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 3.572977304458618, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5285166501998901, "step": 1470, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 2.000150442123413 }, { "episode": 23552, "epoch": 0.42333824639608875, "loss/policy_avg": 0.13718397915363312, "lr": 9.059943762781188e-06, "objective/entropy": -395.03338623046875, "objective/kl": 35.18762969970703, "objective/non_score_reward": -3.5187625885009766, "objective/rlhf_reward": -12.594098451550366, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 39.68256378173828, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.49740928411483765, "step": 1471, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9984986782073975 }, { "episode": 23568, "epoch": 0.4236258403134774, "loss/policy_avg": 0.15360605716705322, "lr": 9.059304703476484e-06, "objective/entropy": -391.634033203125, "objective/kl": 45.451229095458984, "objective/non_score_reward": -4.545123100280762, "objective/rlhf_reward": -15.256773148418638, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 38.9466552734375, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6432242393493652, "step": 1472, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 1.9981634616851807 }, { "episode": 23584, "epoch": 0.423913434230866, "loss/policy_avg": 0.7477294206619263, "lr": 9.05866564417178e-06, "objective/entropy": -378.2859191894531, "objective/kl": 27.80350112915039, "objective/non_score_reward": -2.7803502082824707, "objective/rlhf_reward": -9.67080245456253, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 48.78661346435547, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7658288478851318, "step": 1473, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.0004773139953613 }, { "episode": 23600, "epoch": 0.42420102814825467, "loss/policy_avg": 0.042209088802337646, "lr": 9.058026584867077e-06, "objective/entropy": -304.28759765625, "objective/kl": 45.55292510986328, "objective/non_score_reward": -4.55529260635376, "objective/rlhf_reward": -16.74021792691505, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 53.71491622924805, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8910926580429077, "step": 1474, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9993290901184082 }, { "episode": 23616, "epoch": 0.4244886220656433, "loss/policy_avg": -0.3735031187534332, "lr": 9.057387525562373e-06, "objective/entropy": -382.3561096191406, "objective/kl": 32.805938720703125, "objective/non_score_reward": -3.2805943489074707, "objective/rlhf_reward": -13.122376680374146, "objective/scores": 0.0, "policy/approxkl_avg": 56.551368713378906, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.8362222909927368, "step": 1475, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9999079704284668 }, { "episode": 23632, "epoch": 0.42477621598303195, "loss/policy_avg": 0.0172857865691185, "lr": 9.05674846625767e-06, "objective/entropy": -285.6701965332031, "objective/kl": 36.98664855957031, "objective/non_score_reward": -3.698664665222168, "objective/rlhf_reward": -13.278887593539888, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 1.590710997581482, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.686954915523529, "step": 1476, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 2.0004568099975586 }, { "episode": 23648, "epoch": 0.4250638099004206, "loss/policy_avg": 1.548924446105957, "lr": 9.056109406952967e-06, "objective/entropy": -378.0845947265625, "objective/kl": 39.81169891357422, "objective/non_score_reward": -3.981170177459717, "objective/rlhf_reward": -14.500848133762446, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 29.69135284423828, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5625120401382446, "step": 1477, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 1.997599720954895 }, { "episode": 23664, "epoch": 0.4253514038178092, "loss/policy_avg": -0.6510675549507141, "lr": 9.055470347648262e-06, "objective/entropy": -426.90887451171875, "objective/kl": 35.38024139404297, "objective/non_score_reward": -3.538024425506592, "objective/rlhf_reward": -12.418763653437296, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 2.030459403991699, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.6598312258720398, "step": 1478, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 2.0019612312316895 }, { "episode": 23680, "epoch": 0.4256389977351979, "loss/policy_avg": -0.12841561436653137, "lr": 9.05483128834356e-06, "objective/entropy": -334.1355285644531, "objective/kl": 30.55511474609375, "objective/non_score_reward": -3.055511474609375, "objective/rlhf_reward": -10.822046375274658, "objective/scores": 0.35, "policy/approxkl_avg": 3.4322702884674072, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7594455480575562, "step": 1479, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 2.000781536102295 }, { "episode": 23696, "epoch": 0.42592659165258656, "loss/policy_avg": 1.1366994380950928, "lr": 9.054192229038854e-06, "objective/entropy": -238.75709533691406, "objective/kl": 27.7382869720459, "objective/non_score_reward": -2.7738287448883057, "objective/rlhf_reward": -9.64471636256729, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 2.9101102352142334, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5793651342391968, "step": 1480, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9996988773345947 }, { "episode": 23712, "epoch": 0.4262141855699752, "loss/policy_avg": 0.5514898896217346, "lr": 9.053553169734151e-06, "objective/entropy": -357.0309143066406, "objective/kl": 42.752017974853516, "objective/non_score_reward": -4.275201797485352, "objective/rlhf_reward": -15.153396676258978, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 175.21185302734375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.647923469543457, "step": 1481, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9986519813537598 }, { "episode": 23728, "epoch": 0.42650177948736384, "loss/policy_avg": 0.4780948758125305, "lr": 9.052914110429448e-06, "objective/entropy": -321.1480712890625, "objective/kl": 38.786258697509766, "objective/non_score_reward": -3.8786256313323975, "objective/rlhf_reward": -13.567092011647162, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 5.617657661437988, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5606983304023743, "step": 1482, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 17, "val/ratio": 1.9985097646713257 }, { "episode": 23744, "epoch": 0.4267893734047525, "loss/policy_avg": 0.352334588766098, "lr": 9.052275051124745e-06, "objective/entropy": -256.8111267089844, "objective/kl": 37.848670959472656, "objective/non_score_reward": -3.784867286682129, "objective/rlhf_reward": -12.739468908309938, "objective/scores": 0.6, "policy/approxkl_avg": 2.7496094703674316, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6202201843261719, "step": 1483, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9981956481933594 }, { "episode": 23760, "epoch": 0.4270769673221411, "loss/policy_avg": 0.8569170236587524, "lr": 9.051635991820042e-06, "objective/entropy": -449.56591796875, "objective/kl": 37.11693572998047, "objective/non_score_reward": -3.711693286895752, "objective/rlhf_reward": -13.446773624420167, "objective/scores": 0.35, "policy/approxkl_avg": 25.96988296508789, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.8722147941589355, "step": 1484, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 14, "val/ratio": 1.9972691535949707 }, { "episode": 23776, "epoch": 0.42736456123952976, "loss/policy_avg": -0.08546635508537292, "lr": 9.050996932515339e-06, "objective/entropy": -422.0863037109375, "objective/kl": 33.534671783447266, "objective/non_score_reward": -3.3534674644470215, "objective/rlhf_reward": -11.963271956057891, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 2.7570362091064453, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6319245100021362, "step": 1485, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.0018439292907715 }, { "episode": 23792, "epoch": 0.42765215515691846, "loss/policy_avg": -0.21083611249923706, "lr": 9.050357873210634e-06, "objective/entropy": -376.9493103027344, "objective/kl": 35.88134002685547, "objective/non_score_reward": -3.5881338119506836, "objective/rlhf_reward": -12.69067574065982, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 2.8048529624938965, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.545556902885437, "step": 1486, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 2.0009307861328125 }, { "episode": 23808, "epoch": 0.4279397490743071, "loss/policy_avg": 1.1038469076156616, "lr": 9.049718813905931e-06, "objective/entropy": -380.0547790527344, "objective/kl": 41.13517761230469, "objective/non_score_reward": -4.113517761230469, "objective/rlhf_reward": -15.112436345129638, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 2.880563974380493, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6694772243499756, "step": 1487, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9997215270996094 }, { "episode": 23824, "epoch": 0.42822734299169574, "loss/policy_avg": 0.8747507333755493, "lr": 9.049079754601228e-06, "objective/entropy": -309.8232421875, "objective/kl": 50.309425354003906, "objective/non_score_reward": -5.030942916870117, "objective/rlhf_reward": -18.176360796170172, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 175.2151336669922, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6709839105606079, "step": 1488, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9974381923675537 }, { "episode": 23840, "epoch": 0.4285149369090844, "loss/policy_avg": 0.1925617903470993, "lr": 9.048440695296525e-06, "objective/entropy": -394.33807373046875, "objective/kl": 37.35551452636719, "objective/non_score_reward": -3.735551357269287, "objective/rlhf_reward": -13.280346398771393, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 48.63822555541992, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5627095103263855, "step": 1489, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 2.000102996826172 }, { "episode": 23856, "epoch": 0.428802530826473, "loss/policy_avg": -0.045588746666908264, "lr": 9.047801635991821e-06, "objective/entropy": -447.3486328125, "objective/kl": 42.69718551635742, "objective/non_score_reward": -4.269718647003174, "objective/rlhf_reward": -15.655042727192011, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 48.921485900878906, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7651981115341187, "step": 1490, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.0018506050109863 }, { "episode": 23872, "epoch": 0.42909012474386166, "loss/policy_avg": 0.2849496603012085, "lr": 9.047162576687117e-06, "objective/entropy": -377.84576416015625, "objective/kl": 22.633804321289062, "objective/non_score_reward": -2.263380527496338, "objective/rlhf_reward": -7.5377500889622535, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 9.483132362365723, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6377484798431396, "step": 1491, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 2.000361919403076 }, { "episode": 23888, "epoch": 0.4293777186612503, "loss/policy_avg": -0.191938579082489, "lr": 9.046523517382414e-06, "objective/entropy": -162.94802856445312, "objective/kl": 37.14765167236328, "objective/non_score_reward": -3.7147650718688965, "objective/rlhf_reward": -13.302801459041191, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 63.43394470214844, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5577498078346252, "step": 1492, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.001276969909668 }, { "episode": 23904, "epoch": 0.42966531257863894, "loss/policy_avg": 0.9150327444076538, "lr": 9.04588445807771e-06, "objective/entropy": -428.14874267578125, "objective/kl": 37.09174346923828, "objective/non_score_reward": -3.709174633026123, "objective/rlhf_reward": -13.386099438281402, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 6.7253804206848145, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6686770915985107, "step": 1493, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9980173110961914 }, { "episode": 23920, "epoch": 0.42995290649602763, "loss/policy_avg": 4.315652847290039, "lr": 9.045245398773007e-06, "objective/entropy": -307.2989501953125, "objective/kl": 35.56279754638672, "objective/non_score_reward": -3.5562801361083984, "objective/rlhf_reward": -12.400291557582925, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 4.288968086242676, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.7850028872489929, "step": 1494, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.001962661743164 }, { "episode": 23936, "epoch": 0.43024050041341627, "loss/policy_avg": 0.41525718569755554, "lr": 9.044606339468304e-06, "objective/entropy": -395.378662109375, "objective/kl": 38.50458526611328, "objective/non_score_reward": -3.8504586219787598, "objective/rlhf_reward": -14.023232081023554, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 4.131414413452148, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6804065704345703, "step": 1495, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 2.001801013946533 }, { "episode": 23952, "epoch": 0.4305280943308049, "loss/policy_avg": -0.1491956114768982, "lr": 9.043967280163601e-06, "objective/entropy": -391.86431884765625, "objective/kl": 39.745262145996094, "objective/non_score_reward": -3.9745264053344727, "objective/rlhf_reward": -14.382333123477633, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 40.02337646484375, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.9267502427101135, "step": 1496, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.998293161392212 }, { "episode": 23968, "epoch": 0.43081568824819355, "loss/policy_avg": 0.4301025867462158, "lr": 9.043328220858896e-06, "objective/entropy": -316.357666015625, "objective/kl": 32.892974853515625, "objective/non_score_reward": -3.289297580718994, "objective/rlhf_reward": -11.034484090582403, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 5.895191669464111, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.9712218046188354, "step": 1497, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 19, "val/ratio": 1.9984525442123413 }, { "episode": 23984, "epoch": 0.4311032821655822, "loss/policy_avg": 0.9484543204307556, "lr": 9.042689161554193e-06, "objective/entropy": -411.95135498046875, "objective/kl": 35.087196350097656, "objective/non_score_reward": -3.5087196826934814, "objective/rlhf_reward": -12.675629579757135, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 4.810245037078857, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6589264273643494, "step": 1498, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.9993133544921875 }, { "episode": 24000, "epoch": 0.43139087608297083, "loss/policy_avg": -0.0660000890493393, "lr": 9.04205010224949e-06, "objective/entropy": -381.00885009765625, "objective/kl": 42.001365661621094, "objective/non_score_reward": -4.200136184692383, "objective/rlhf_reward": -15.441294872497004, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 66.35794067382812, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6763359308242798, "step": 1499, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9996612071990967 }, { "episode": 24016, "epoch": 0.43167847000035947, "loss/policy_avg": 0.4921269714832306, "lr": 9.041411042944787e-06, "objective/entropy": -454.1153564453125, "objective/kl": 32.006927490234375, "objective/non_score_reward": -3.200692653656006, "objective/rlhf_reward": -11.069436804453531, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 1.1622430086135864, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6754111051559448, "step": 1500, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 2.001211643218994 }, { "episode": 24032, "epoch": 0.43196606391774817, "loss/policy_avg": 0.007434425875544548, "lr": 9.040771983640082e-06, "objective/entropy": -406.6236572265625, "objective/kl": 38.99463653564453, "objective/non_score_reward": -3.899463653564453, "objective/rlhf_reward": -13.475148381964239, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 1.5417029857635498, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6786209344863892, "step": 1501, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9994696378707886 }, { "episode": 24048, "epoch": 0.4322536578351368, "loss/policy_avg": 0.13553458452224731, "lr": 9.040132924335379e-06, "objective/entropy": -355.88494873046875, "objective/kl": 42.831153869628906, "objective/non_score_reward": -4.283115386962891, "objective/rlhf_reward": -15.790825894385009, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 9.461507797241211, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7190350294113159, "step": 1502, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 2.0063533782958984 }, { "episode": 24064, "epoch": 0.43254125175252545, "loss/policy_avg": -0.06194007396697998, "lr": 9.039493865030676e-06, "objective/entropy": -377.9845886230469, "objective/kl": 44.252967834472656, "objective/non_score_reward": -4.425296783447266, "objective/rlhf_reward": -16.1449279477268, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 52.05303192138672, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8155778646469116, "step": 1503, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 2.0005385875701904 }, { "episode": 24080, "epoch": 0.4328288456699141, "loss/policy_avg": 0.03773313760757446, "lr": 9.038854805725971e-06, "objective/entropy": -408.88104248046875, "objective/kl": 39.60765838623047, "objective/non_score_reward": -3.960765838623047, "objective/rlhf_reward": -14.464462139693598, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 1.7364420890808105, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7957252860069275, "step": 1504, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.9998862743377686 }, { "episode": 24096, "epoch": 0.4331164395873027, "loss/policy_avg": 0.4154024124145508, "lr": 9.038215746421268e-06, "objective/entropy": -405.78375244140625, "objective/kl": 27.858734130859375, "objective/non_score_reward": -2.7858738899230957, "objective/rlhf_reward": -9.692896704287872, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 26.681400299072266, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6881489753723145, "step": 1505, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.999143123626709 }, { "episode": 24112, "epoch": 0.43340403350469137, "loss/policy_avg": 0.024253152310848236, "lr": 9.037576687116565e-06, "objective/entropy": -328.92193603515625, "objective/kl": 32.261016845703125, "objective/non_score_reward": -3.226101875305176, "objective/rlhf_reward": -11.545157396529598, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 2.808337926864624, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6182868480682373, "step": 1506, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 2.0004658699035645 }, { "episode": 24128, "epoch": 0.43369162742208, "loss/policy_avg": 0.6560850143432617, "lr": 9.036937627811862e-06, "objective/entropy": -310.0798034667969, "objective/kl": 22.40703582763672, "objective/non_score_reward": -2.240703582763672, "objective/rlhf_reward": -7.621178796797423, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 22.392059326171875, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.4910814166069031, "step": 1507, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 2.006934881210327 }, { "episode": 24144, "epoch": 0.43397922133946865, "loss/policy_avg": -0.7266422510147095, "lr": 9.036298568507159e-06, "objective/entropy": -405.6082763671875, "objective/kl": 38.27037811279297, "objective/non_score_reward": -3.827038049697876, "objective/rlhf_reward": -13.92954955348144, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 0.788671612739563, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6242544054985046, "step": 1508, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 2.0020060539245605 }, { "episode": 24160, "epoch": 0.43426681525685734, "loss/policy_avg": 4.717721462249756, "lr": 9.035659509202455e-06, "objective/entropy": -395.2735595703125, "objective/kl": 28.65047836303711, "objective/non_score_reward": -2.8650481700897217, "objective/rlhf_reward": -9.337485971228155, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 1.928009033203125, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.8637462854385376, "step": 1509, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 16, "val/ratio": 1.9996259212493896 }, { "episode": 24176, "epoch": 0.434554409174246, "loss/policy_avg": 0.6516103744506836, "lr": 9.03502044989775e-06, "objective/entropy": -377.79156494140625, "objective/kl": 42.77476501464844, "objective/non_score_reward": -4.2774763107299805, "objective/rlhf_reward": -15.768269351034789, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 50.199161529541016, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7056669592857361, "step": 1510, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9978466033935547 }, { "episode": 24192, "epoch": 0.4348420030916346, "loss/policy_avg": 0.29704493284225464, "lr": 9.034381390593047e-06, "objective/entropy": -391.98162841796875, "objective/kl": 23.470632553100586, "objective/non_score_reward": -2.3470633029937744, "objective/rlhf_reward": -8.046617320089965, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 7.67216682434082, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6858037710189819, "step": 1511, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.9997867345809937 }, { "episode": 24208, "epoch": 0.43512959700902326, "loss/policy_avg": 0.06189926713705063, "lr": 9.033742331288344e-06, "objective/entropy": -379.1957702636719, "objective/kl": 37.45853805541992, "objective/non_score_reward": -3.7458536624908447, "objective/rlhf_reward": -13.604812719909052, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 1.3426764011383057, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6298016309738159, "step": 1512, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 16, "val/ratio": 2.0000810623168945 }, { "episode": 24224, "epoch": 0.4354171909264119, "loss/policy_avg": 0.7977212071418762, "lr": 9.033103271983641e-06, "objective/entropy": -385.604248046875, "objective/kl": 40.92778015136719, "objective/non_score_reward": -4.092777729034424, "objective/rlhf_reward": -14.94727953215417, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 11.580866813659668, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7289886474609375, "step": 1513, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9995388984680176 }, { "episode": 24240, "epoch": 0.43570478484380054, "loss/policy_avg": 0.06498114764690399, "lr": 9.032464212678938e-06, "objective/entropy": -443.98974609375, "objective/kl": 43.32032775878906, "objective/non_score_reward": -4.332033157348633, "objective/rlhf_reward": -15.904299338062373, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 2.893068790435791, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6112293004989624, "step": 1514, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 2.0005722045898438 }, { "episode": 24256, "epoch": 0.4359923787611892, "loss/policy_avg": 0.6366689205169678, "lr": 9.031825153374233e-06, "objective/entropy": -339.7879333496094, "objective/kl": 36.07059860229492, "objective/non_score_reward": -3.6070597171783447, "objective/rlhf_reward": -13.086603692083983, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 5.620899200439453, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6315170526504517, "step": 1515, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9994146823883057 }, { "episode": 24272, "epoch": 0.4362799726785778, "loss/policy_avg": 0.9081555604934692, "lr": 9.03118609406953e-06, "objective/entropy": -403.9309387207031, "objective/kl": 47.8525390625, "objective/non_score_reward": -4.78525447845459, "objective/rlhf_reward": -17.781767093871515, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 108.16657257080078, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.761695384979248, "step": 1516, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9983506202697754 }, { "episode": 24288, "epoch": 0.4365675665959665, "loss/policy_avg": -0.1702185571193695, "lr": 9.030547034764827e-06, "objective/entropy": -423.26507568359375, "objective/kl": 40.318885803222656, "objective/non_score_reward": -4.031888961791992, "objective/rlhf_reward": -14.52343705660494, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 3.424177885055542, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6178144216537476, "step": 1517, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 2.0012824535369873 }, { "episode": 24304, "epoch": 0.43685516051335516, "loss/policy_avg": 1.2113523483276367, "lr": 9.029907975460124e-06, "objective/entropy": -337.96966552734375, "objective/kl": 37.78770065307617, "objective/non_score_reward": -3.7787702083587646, "objective/rlhf_reward": -12.19136134231207, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 12.002867698669434, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.7078724503517151, "step": 1518, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 14, "val/ratio": 1.9984080791473389 }, { "episode": 24320, "epoch": 0.4371427544307438, "loss/policy_avg": 0.8005465269088745, "lr": 9.02926891615542e-06, "objective/entropy": -414.1435546875, "objective/kl": 37.96477127075195, "objective/non_score_reward": -3.7964768409729004, "objective/rlhf_reward": -13.361079569133828, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 29.312976837158203, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6618801355361938, "step": 1519, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9988306760787964 }, { "episode": 24336, "epoch": 0.43743034834813244, "loss/policy_avg": 1.0283199548721313, "lr": 9.028629856850718e-06, "objective/entropy": -396.291015625, "objective/kl": 43.35575485229492, "objective/non_score_reward": -4.335575580596924, "objective/rlhf_reward": -15.942301905155183, "objective/scores": 0.35, "policy/approxkl_avg": 57.40699768066406, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6221705079078674, "step": 1520, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 2.0030202865600586 }, { "episode": 24352, "epoch": 0.4377179422655211, "loss/policy_avg": 0.10681764036417007, "lr": 9.027990797546013e-06, "objective/entropy": -401.71630859375, "objective/kl": 37.124122619628906, "objective/non_score_reward": -3.7124123573303223, "objective/rlhf_reward": -13.293390124049736, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 4.367173194885254, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6035778522491455, "step": 1521, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.9976961612701416 }, { "episode": 24368, "epoch": 0.4380055361829097, "loss/policy_avg": -0.15628403425216675, "lr": 9.02735173824131e-06, "objective/entropy": -391.35009765625, "objective/kl": 32.41697692871094, "objective/non_score_reward": -3.2416977882385254, "objective/rlhf_reward": -11.485838535244822, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 6.8910417556762695, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.559136688709259, "step": 1522, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.9990170001983643 }, { "episode": 24384, "epoch": 0.43829313010029836, "loss/policy_avg": 0.3668467402458191, "lr": 9.026712678936605e-06, "objective/entropy": -356.6075439453125, "objective/kl": 37.85493087768555, "objective/non_score_reward": -3.7854928970336914, "objective/rlhf_reward": -13.741972064971925, "objective/scores": 0.35, "policy/approxkl_avg": 36.07557678222656, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8570120334625244, "step": 1523, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 18, "val/ratio": 1.9970687627792358 }, { "episode": 24400, "epoch": 0.43858072401768705, "loss/policy_avg": 0.3234935998916626, "lr": 9.026073619631902e-06, "objective/entropy": -310.8675537109375, "objective/kl": 39.823524475097656, "objective/non_score_reward": -3.9823524951934814, "objective/rlhf_reward": -14.44845700543678, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 96.22573852539062, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.879695475101471, "step": 1524, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 1.9995683431625366 }, { "episode": 24416, "epoch": 0.4388683179350757, "loss/policy_avg": 0.11517748236656189, "lr": 9.025434560327199e-06, "objective/entropy": -442.9134826660156, "objective/kl": 31.637725830078125, "objective/non_score_reward": -3.1637725830078125, "objective/rlhf_reward": -11.276488640395502, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 1.3731237649917603, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5308844447135925, "step": 1525, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 15, "val/ratio": 1.9985350370407104 }, { "episode": 24432, "epoch": 0.43915591185246433, "loss/policy_avg": 0.8228294253349304, "lr": 9.024795501022496e-06, "objective/entropy": -388.2705993652344, "objective/kl": 43.79899215698242, "objective/non_score_reward": -4.379899024963379, "objective/rlhf_reward": -15.572186301426825, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 35.425865173339844, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6076502203941345, "step": 1526, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9981544017791748 }, { "episode": 24448, "epoch": 0.43944350576985297, "loss/policy_avg": 0.18832090497016907, "lr": 9.024156441717792e-06, "objective/entropy": -300.5210266113281, "objective/kl": 38.06460189819336, "objective/non_score_reward": -3.80646014213562, "objective/rlhf_reward": -13.802008469303217, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 1.4844846725463867, "policy/clipfrac_avg": 0.25, "policy/entropy_avg": 0.6342014074325562, "step": 1527, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.9997735023498535 }, { "episode": 24464, "epoch": 0.4397310996872416, "loss/policy_avg": 0.4037103056907654, "lr": 9.023517382413088e-06, "objective/entropy": -284.7015075683594, "objective/kl": 36.058387756347656, "objective/non_score_reward": -3.605839252471924, "objective/rlhf_reward": -13.023356771469118, "objective/scores": 0.35, "policy/approxkl_avg": 34.16374969482422, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5712857246398926, "step": 1528, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9996953010559082 }, { "episode": 24480, "epoch": 0.44001869360463025, "loss/policy_avg": 0.6154798865318298, "lr": 9.022878323108385e-06, "objective/entropy": -349.182373046875, "objective/kl": 31.549602508544922, "objective/non_score_reward": -3.1549601554870605, "objective/rlhf_reward": -11.196008761127558, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 2.7842953205108643, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5941054224967957, "step": 1529, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 2.0026164054870605 }, { "episode": 24496, "epoch": 0.4403062875220189, "loss/policy_avg": 0.23553967475891113, "lr": 9.022239263803681e-06, "objective/entropy": -373.7146911621094, "objective/kl": 26.200977325439453, "objective/non_score_reward": -2.620098114013672, "objective/rlhf_reward": -6.08039174079895, "objective/scores": 1.1, "policy/approxkl_avg": 59.1917724609375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.547480583190918, "step": 1530, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9977792501449585 }, { "episode": 24512, "epoch": 0.44059388143940753, "loss/policy_avg": 0.579300045967102, "lr": 9.021600204498978e-06, "objective/entropy": -461.40643310546875, "objective/kl": 35.00432586669922, "objective/non_score_reward": -3.5004329681396484, "objective/rlhf_reward": -12.05432040520185, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 15.165961265563965, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5694072246551514, "step": 1531, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.9975147247314453 }, { "episode": 24528, "epoch": 0.4408814753567962, "loss/policy_avg": 0.40463489294052124, "lr": 9.020961145194275e-06, "objective/entropy": -208.33663940429688, "objective/kl": 52.09930419921875, "objective/non_score_reward": -5.209930419921875, "objective/rlhf_reward": -18.717015328184637, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 14.309218406677246, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5917267799377441, "step": 1532, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.999226689338684 }, { "episode": 24544, "epoch": 0.44116906927418487, "loss/policy_avg": 14.400466918945312, "lr": 9.020322085889572e-06, "objective/entropy": -262.22564697265625, "objective/kl": 52.68130874633789, "objective/non_score_reward": -5.268130779266357, "objective/rlhf_reward": -19.74701074126355, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 5.0040693283081055, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.5490331649780273, "step": 1533, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9995980262756348 }, { "episode": 24560, "epoch": 0.4414566631915735, "loss/policy_avg": 1.198805332183838, "lr": 9.019683026584867e-06, "objective/entropy": -394.04034423828125, "objective/kl": 42.05086135864258, "objective/non_score_reward": -4.2050862312316895, "objective/rlhf_reward": -15.420345163345338, "objective/scores": 0.35, "policy/approxkl_avg": 76.02043151855469, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6470862627029419, "step": 1534, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9996511936187744 }, { "episode": 24576, "epoch": 0.44174425710896215, "loss/policy_avg": 0.47238361835479736, "lr": 9.019043967280164e-06, "objective/entropy": -379.3162841796875, "objective/kl": 42.87568664550781, "objective/non_score_reward": -4.287569046020508, "objective/rlhf_reward": -15.546154532496054, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 25.54429817199707, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5488759875297546, "step": 1535, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9979074001312256 }, { "episode": 24592, "epoch": 0.4420318510263508, "loss/policy_avg": 1.5977990627288818, "lr": 9.018404907975461e-06, "objective/entropy": -446.0715637207031, "objective/kl": 31.530227661132812, "objective/non_score_reward": -3.1530227661132812, "objective/rlhf_reward": -11.252841078971308, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 61.80257034301758, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5237058401107788, "step": 1536, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9978086948394775 }, { "episode": 24608, "epoch": 0.4423194449437394, "loss/policy_avg": 0.9344546794891357, "lr": 9.017765848670758e-06, "objective/entropy": -319.21734619140625, "objective/kl": 48.06114196777344, "objective/non_score_reward": -4.806114196777344, "objective/rlhf_reward": -17.66819819709356, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 15.261993408203125, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.8439440727233887, "step": 1537, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9966486692428589 }, { "episode": 24624, "epoch": 0.44260703886112807, "loss/policy_avg": -0.36375680565834045, "lr": 9.017126789366055e-06, "objective/entropy": -447.162109375, "objective/kl": 26.631446838378906, "objective/non_score_reward": -2.663144826889038, "objective/rlhf_reward": -9.096319525447441, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 38.85789108276367, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7473317384719849, "step": 1538, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 19, "val/ratio": 1.999395728111267 }, { "episode": 24640, "epoch": 0.44289463277851676, "loss/policy_avg": 0.24433717131614685, "lr": 9.01648773006135e-06, "objective/entropy": -391.6814270019531, "objective/kl": 27.217384338378906, "objective/non_score_reward": -2.721738338470459, "objective/rlhf_reward": -9.282834086481648, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 10.176107406616211, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6825709342956543, "step": 1539, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9980521202087402 }, { "episode": 24656, "epoch": 0.4431822266959054, "loss/policy_avg": 0.5150004029273987, "lr": 9.015848670756647e-06, "objective/entropy": -439.9432373046875, "objective/kl": 25.358205795288086, "objective/non_score_reward": -2.535820484161377, "objective/rlhf_reward": -8.627510392459568, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 2.077700138092041, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7785260081291199, "step": 1540, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 1.9997822046279907 }, { "episode": 24672, "epoch": 0.44346982061329404, "loss/policy_avg": 1.2010408639907837, "lr": 9.015209611451944e-06, "objective/entropy": -399.0957336425781, "objective/kl": 40.07758331298828, "objective/non_score_reward": -4.007758140563965, "objective/rlhf_reward": -14.426913056437094, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 31.167686462402344, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6461334228515625, "step": 1541, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 2.002359628677368 }, { "episode": 24688, "epoch": 0.4437574145306827, "loss/policy_avg": 1.4353135824203491, "lr": 9.01457055214724e-06, "objective/entropy": -379.08740234375, "objective/kl": 32.08915328979492, "objective/non_score_reward": -3.2089152336120605, "objective/rlhf_reward": -11.354708555157544, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 69.00767517089844, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8375000357627869, "step": 1542, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9979612827301025 }, { "episode": 24704, "epoch": 0.4440450084480713, "loss/policy_avg": 0.19873923063278198, "lr": 9.013931492842537e-06, "objective/entropy": -370.26959228515625, "objective/kl": 37.65454864501953, "objective/non_score_reward": -3.7654552459716797, "objective/rlhf_reward": -12.661820507049562, "objective/scores": 0.6, "policy/approxkl_avg": 6.024623870849609, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7288262844085693, "step": 1543, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9979007244110107 }, { "episode": 24720, "epoch": 0.44433260236545996, "loss/policy_avg": 0.724319338798523, "lr": 9.013292433537834e-06, "objective/entropy": -375.3143005371094, "objective/kl": 37.62146759033203, "objective/non_score_reward": -3.7621469497680664, "objective/rlhf_reward": -13.624755222995844, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 1.517210841178894, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.7026063203811646, "step": 1544, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.0016303062438965 }, { "episode": 24736, "epoch": 0.4446201962828486, "loss/policy_avg": 1.0892194509506226, "lr": 9.01265337423313e-06, "objective/entropy": -366.7115478515625, "objective/kl": 33.899940490722656, "objective/non_score_reward": -3.3899941444396973, "objective/rlhf_reward": -12.044205271991427, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 2.280407190322876, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.6310385465621948, "step": 1545, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.999575138092041 }, { "episode": 24752, "epoch": 0.44490779020023724, "loss/policy_avg": -0.6268893480300903, "lr": 9.012014314928426e-06, "objective/entropy": -347.9535217285156, "objective/kl": 26.658119201660156, "objective/non_score_reward": -2.6658120155334473, "objective/rlhf_reward": -9.239416082103816, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 8.413990020751953, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6712939739227295, "step": 1546, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.0004758834838867 }, { "episode": 24768, "epoch": 0.44519538411762594, "loss/policy_avg": 0.25843602418899536, "lr": 9.011375255623722e-06, "objective/entropy": -384.2995300292969, "objective/kl": 46.968143463134766, "objective/non_score_reward": -4.69681453704834, "objective/rlhf_reward": -17.428007089827936, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 37.48634338378906, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7249013185501099, "step": 1547, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 14, "val/ratio": 1.9985175132751465 }, { "episode": 24784, "epoch": 0.4454829780350146, "loss/policy_avg": 0.16386398673057556, "lr": 9.010736196319018e-06, "objective/entropy": -382.41387939453125, "objective/kl": 42.38301086425781, "objective/non_score_reward": -4.2383012771606445, "objective/rlhf_reward": -16.953206300735474, "objective/scores": 0.0, "policy/approxkl_avg": 48.759620666503906, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7275990843772888, "step": 1548, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.000369071960449 }, { "episode": 24800, "epoch": 0.4457705719524032, "loss/policy_avg": -0.7410064935684204, "lr": 9.010097137014315e-06, "objective/entropy": -334.89776611328125, "objective/kl": 35.38103103637695, "objective/non_score_reward": -3.5381031036376953, "objective/rlhf_reward": -12.826899919539613, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 3.986295223236084, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.5116535425186157, "step": 1549, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.000373363494873 }, { "episode": 24816, "epoch": 0.44605816586979186, "loss/policy_avg": 0.07825727760791779, "lr": 9.009458077709612e-06, "objective/entropy": -426.83526611328125, "objective/kl": 36.107017517089844, "objective/non_score_reward": -3.6107017993927, "objective/rlhf_reward": -14.44280731678009, "objective/scores": 0.0, "policy/approxkl_avg": 53.26011657714844, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5660552382469177, "step": 1550, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.0002856254577637 }, { "episode": 24832, "epoch": 0.4463457597871805, "loss/policy_avg": -0.08407485485076904, "lr": 9.008819018404909e-06, "objective/entropy": -282.173583984375, "objective/kl": 39.92147445678711, "objective/non_score_reward": -3.992147445678711, "objective/rlhf_reward": -14.568590021133424, "objective/scores": 0.35, "policy/approxkl_avg": 3.9791133403778076, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6729458570480347, "step": 1551, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 2.0009467601776123 }, { "episode": 24848, "epoch": 0.44663335370456914, "loss/policy_avg": 0.17564226686954498, "lr": 9.008179959100204e-06, "objective/entropy": -378.7695007324219, "objective/kl": 25.96942138671875, "objective/non_score_reward": -2.59694242477417, "objective/rlhf_reward": -8.87199791649216, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 6.114863395690918, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7612414360046387, "step": 1552, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 14, "val/ratio": 1.9980354309082031 }, { "episode": 24864, "epoch": 0.4469209476219578, "loss/policy_avg": 0.5848523378372192, "lr": 9.007540899795501e-06, "objective/entropy": -320.199951171875, "objective/kl": 43.87555694580078, "objective/non_score_reward": -4.3875555992126465, "objective/rlhf_reward": -15.993962853160454, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 7.26108455657959, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8406010270118713, "step": 1553, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9988974332809448 }, { "episode": 24880, "epoch": 0.4472085415393464, "loss/policy_avg": -0.10538221895694733, "lr": 9.006901840490798e-06, "objective/entropy": -392.84027099609375, "objective/kl": 29.05345916748047, "objective/non_score_reward": -2.905345916748047, "objective/rlhf_reward": -9.221384382247924, "objective/scores": 0.6, "policy/approxkl_avg": 3.9698846340179443, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5678784847259521, "step": 1554, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 2.00101900100708 }, { "episode": 24896, "epoch": 0.4474961354567351, "loss/policy_avg": -0.03046274185180664, "lr": 9.006262781186095e-06, "objective/entropy": -407.0494384765625, "objective/kl": 31.977157592773438, "objective/non_score_reward": -3.1977155208587646, "objective/rlhf_reward": -10.843451092915473, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 62.80748748779297, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5845105648040771, "step": 1555, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0000979900360107 }, { "episode": 24912, "epoch": 0.44778372937412375, "loss/policy_avg": 0.6718517541885376, "lr": 9.005623721881392e-06, "objective/entropy": -403.23638916015625, "objective/kl": 46.62294006347656, "objective/non_score_reward": -4.662293434143066, "objective/rlhf_reward": -16.915841833750406, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 13.390661239624023, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7493401765823364, "step": 1556, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 14, "val/ratio": 1.9986774921417236 }, { "episode": 24928, "epoch": 0.4480713232915124, "loss/policy_avg": 0.6341782808303833, "lr": 9.004984662576689e-06, "objective/entropy": -394.71429443359375, "objective/kl": 32.41244888305664, "objective/non_score_reward": -3.2412452697753906, "objective/rlhf_reward": -8.564980483055116, "objective/scores": 1.1, "policy/approxkl_avg": 4.848804473876953, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7392096519470215, "step": 1557, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.999105453491211 }, { "episode": 24944, "epoch": 0.44835891720890103, "loss/policy_avg": 0.9781434535980225, "lr": 9.004345603271984e-06, "objective/entropy": -412.18023681640625, "objective/kl": 25.61279296875, "objective/non_score_reward": -2.561279296875, "objective/rlhf_reward": -8.640997443262654, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 27.94587516784668, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.5848729610443115, "step": 1558, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.0003609657287598 }, { "episode": 24960, "epoch": 0.44864651112628967, "loss/policy_avg": 0.3365482687950134, "lr": 9.00370654396728e-06, "objective/entropy": -404.7320251464844, "objective/kl": 39.955684661865234, "objective/non_score_reward": -3.9955685138702393, "objective/rlhf_reward": -11.58227334022522, "objective/scores": 1.1, "policy/approxkl_avg": 10.052909851074219, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7171013355255127, "step": 1559, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 1.9992034435272217 }, { "episode": 24976, "epoch": 0.4489341050436783, "loss/policy_avg": -0.04976231977343559, "lr": 9.003067484662578e-06, "objective/entropy": -413.3092041015625, "objective/kl": 36.63374710083008, "objective/non_score_reward": -3.663374900817871, "objective/rlhf_reward": -12.92016638914744, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 1.8754290342330933, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5858293771743774, "step": 1560, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.0008695125579834 }, { "episode": 24992, "epoch": 0.44922169896106695, "loss/policy_avg": 0.8078259825706482, "lr": 9.002428425357874e-06, "objective/entropy": -367.67608642578125, "objective/kl": 42.71329116821289, "objective/non_score_reward": -4.271328926086426, "objective/rlhf_reward": -14.685316658020021, "objective/scores": 0.6, "policy/approxkl_avg": 24.521787643432617, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4760279953479767, "step": 1561, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9982843399047852 }, { "episode": 25008, "epoch": 0.44950929287845565, "loss/policy_avg": -0.9788036346435547, "lr": 9.001789366053171e-06, "objective/entropy": -399.4076232910156, "objective/kl": 40.31508255004883, "objective/non_score_reward": -4.031508445739746, "objective/rlhf_reward": -14.301204438480447, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 4.9122161865234375, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6039241552352905, "step": 1562, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 2.0087552070617676 }, { "episode": 25024, "epoch": 0.4497968867958443, "loss/policy_avg": 0.5489621758460999, "lr": 9.001150306748467e-06, "objective/entropy": -415.7235107421875, "objective/kl": 36.78160858154297, "objective/non_score_reward": -3.678161144256592, "objective/rlhf_reward": -12.31264433860779, "objective/scores": 0.6, "policy/approxkl_avg": 10.603200912475586, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5127561092376709, "step": 1563, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9992984533309937 }, { "episode": 25040, "epoch": 0.4500844807132329, "loss/policy_avg": 0.3486945927143097, "lr": 9.000511247443763e-06, "objective/entropy": -403.9288330078125, "objective/kl": 24.092403411865234, "objective/non_score_reward": -2.40924072265625, "objective/rlhf_reward": -8.277712428306026, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 10.847503662109375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6689282655715942, "step": 1564, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9980721473693848 }, { "episode": 25056, "epoch": 0.45037207463062157, "loss/policy_avg": 0.5966230630874634, "lr": 8.99987218813906e-06, "objective/entropy": -154.16067504882812, "objective/kl": 51.647193908691406, "objective/non_score_reward": -5.164719581604004, "objective/rlhf_reward": -19.25887999534607, "objective/scores": 0.35, "policy/approxkl_avg": 151.61903381347656, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5423076152801514, "step": 1565, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9976308345794678 }, { "episode": 25072, "epoch": 0.4506596685480102, "loss/policy_avg": -0.2641555666923523, "lr": 8.999233128834357e-06, "objective/entropy": -430.33917236328125, "objective/kl": 47.79615020751953, "objective/non_score_reward": -4.77961540222168, "objective/rlhf_reward": -17.60268934944504, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 2.918701648712158, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.687942385673523, "step": 1566, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.9991919994354248 }, { "episode": 25088, "epoch": 0.45094726246539885, "loss/policy_avg": 0.009062454104423523, "lr": 8.998594069529654e-06, "objective/entropy": -402.27972412109375, "objective/kl": 42.953285217285156, "objective/non_score_reward": -4.295328617095947, "objective/rlhf_reward": -15.802712061492304, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 84.20780944824219, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7290328741073608, "step": 1567, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9985921382904053 }, { "episode": 25104, "epoch": 0.4512348563827875, "loss/policy_avg": 1.0277724266052246, "lr": 8.99795501022495e-06, "objective/entropy": -346.07989501953125, "objective/kl": 41.89537048339844, "objective/non_score_reward": -4.189537048339844, "objective/rlhf_reward": -15.096288209379303, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 15.69582462310791, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6725494265556335, "step": 1568, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 1.9986522197723389 }, { "episode": 25120, "epoch": 0.4515224503001761, "loss/policy_avg": 1.604515552520752, "lr": 8.997315950920246e-06, "objective/entropy": -390.841796875, "objective/kl": 46.394317626953125, "objective/non_score_reward": -4.639431476593018, "objective/rlhf_reward": -17.179123976317744, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 20.32146453857422, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6986252069473267, "step": 1569, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 17, "val/ratio": 1.9976015090942383 }, { "episode": 25136, "epoch": 0.4518100442175648, "loss/policy_avg": 0.07045532017946243, "lr": 8.996676891615543e-06, "objective/entropy": -407.0528259277344, "objective/kl": 26.508319854736328, "objective/non_score_reward": -2.650831937789917, "objective/rlhf_reward": -9.179495651920405, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 1.8644967079162598, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6746435165405273, "step": 1570, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 15, "val/ratio": 2.000070571899414 }, { "episode": 25152, "epoch": 0.45209763813495346, "loss/policy_avg": 0.6469980478286743, "lr": 8.996037832310838e-06, "objective/entropy": -373.18463134765625, "objective/kl": 42.834129333496094, "objective/non_score_reward": -4.283413410186768, "objective/rlhf_reward": -15.755050995437006, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 37.90662384033203, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7600852251052856, "step": 1571, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 1.9987890720367432 }, { "episode": 25168, "epoch": 0.4523852320523421, "loss/policy_avg": 0.0857057124376297, "lr": 8.995398773006135e-06, "objective/entropy": -404.57757568359375, "objective/kl": 44.52537155151367, "objective/non_score_reward": -4.452537536621094, "objective/rlhf_reward": -16.076816336313883, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 1.3558183908462524, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5978577136993408, "step": 1572, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9993383884429932 }, { "episode": 25184, "epoch": 0.45267282596973074, "loss/policy_avg": -0.13575121760368347, "lr": 8.994759713701432e-06, "objective/entropy": -374.91009521484375, "objective/kl": 41.43658447265625, "objective/non_score_reward": -4.143658638000488, "objective/rlhf_reward": -15.124034981341705, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 3.9335005283355713, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5603492259979248, "step": 1573, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.001004219055176 }, { "episode": 25200, "epoch": 0.4529604198871194, "loss/policy_avg": 0.6448495984077454, "lr": 8.994120654396729e-06, "objective/entropy": -395.01287841796875, "objective/kl": 45.77439880371094, "objective/non_score_reward": -4.57744026184082, "objective/rlhf_reward": -16.753500549998833, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 37.59837341308594, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7212796211242676, "step": 1574, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 1.998678207397461 }, { "episode": 25216, "epoch": 0.453248013804508, "loss/policy_avg": 0.6448776721954346, "lr": 8.993481595092026e-06, "objective/entropy": -439.9145202636719, "objective/kl": 32.40359878540039, "objective/non_score_reward": -3.2403600215911865, "objective/rlhf_reward": -10.037721072078917, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 15.634969711303711, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6747137308120728, "step": 1575, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9976146221160889 }, { "episode": 25232, "epoch": 0.45353560772189666, "loss/policy_avg": 0.3177574574947357, "lr": 8.992842535787321e-06, "objective/entropy": -291.8227233886719, "objective/kl": 43.445552825927734, "objective/non_score_reward": -4.344555377960205, "objective/rlhf_reward": -15.927623133273467, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 4.222565174102783, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.4807971715927124, "step": 1576, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9999102354049683 }, { "episode": 25248, "epoch": 0.45382320163928536, "loss/policy_avg": 0.014031085185706615, "lr": 8.992203476482618e-06, "objective/entropy": -360.3704833984375, "objective/kl": 31.09727668762207, "objective/non_score_reward": -3.1097278594970703, "objective/rlhf_reward": -8.038911199569702, "objective/scores": 1.1, "policy/approxkl_avg": 1.4646967649459839, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7990518808364868, "step": 1577, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 15, "val/ratio": 1.9993723630905151 }, { "episode": 25264, "epoch": 0.454110795556674, "loss/policy_avg": -0.22409991919994354, "lr": 8.991564417177915e-06, "objective/entropy": -377.56353759765625, "objective/kl": 38.716087341308594, "objective/non_score_reward": -3.8716084957122803, "objective/rlhf_reward": -14.127184354995173, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 2.8400111198425293, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6347936391830444, "step": 1578, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9981651306152344 }, { "episode": 25280, "epoch": 0.45439838947406264, "loss/policy_avg": 0.1815337836742401, "lr": 8.990925357873212e-06, "objective/entropy": -362.4527587890625, "objective/kl": 46.21480941772461, "objective/non_score_reward": -4.621480941772461, "objective/rlhf_reward": -16.661094064983438, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 142.86077880859375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8845950961112976, "step": 1579, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 14, "val/ratio": 1.998314619064331 }, { "episode": 25296, "epoch": 0.4546859833914513, "loss/policy_avg": -0.011334492824971676, "lr": 8.990286298568508e-06, "objective/entropy": -440.6584167480469, "objective/kl": 38.82865524291992, "objective/non_score_reward": -3.8828654289245605, "objective/rlhf_reward": -14.050509097988963, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 4.592876434326172, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.675102710723877, "step": 1580, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 2.0002188682556152 }, { "episode": 25312, "epoch": 0.4549735773088399, "loss/policy_avg": 0.6402010917663574, "lr": 8.989647239263805e-06, "objective/entropy": -381.83685302734375, "objective/kl": 48.979705810546875, "objective/non_score_reward": -4.897971153259277, "objective/rlhf_reward": -17.93002367538272, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 88.46405792236328, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6586170196533203, "step": 1581, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9991507530212402 }, { "episode": 25328, "epoch": 0.45526117122622856, "loss/policy_avg": 0.3545636534690857, "lr": 8.9890081799591e-06, "objective/entropy": -415.12860107421875, "objective/kl": 44.440547943115234, "objective/non_score_reward": -4.44405460357666, "objective/rlhf_reward": -16.32562170466934, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 9.411672592163086, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6674659252166748, "step": 1582, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9976952075958252 }, { "episode": 25344, "epoch": 0.4555487651436172, "loss/policy_avg": 0.2763954699039459, "lr": 8.988369120654397e-06, "objective/entropy": -378.57073974609375, "objective/kl": 36.687042236328125, "objective/non_score_reward": -3.6687042713165283, "objective/rlhf_reward": -13.22421894511734, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 10.387723922729492, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5623950958251953, "step": 1583, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9960174560546875 }, { "episode": 25360, "epoch": 0.45583635906100584, "loss/policy_avg": 0.3340342938899994, "lr": 8.987730061349694e-06, "objective/entropy": -325.21282958984375, "objective/kl": 43.267295837402344, "objective/non_score_reward": -4.326729774475098, "objective/rlhf_reward": -15.856321196170196, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 8.629939079284668, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5923956632614136, "step": 1584, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9963407516479492 }, { "episode": 25376, "epoch": 0.45612395297839453, "loss/policy_avg": 0.6136425733566284, "lr": 8.987091002044991e-06, "objective/entropy": -387.4258117675781, "objective/kl": 39.17667770385742, "objective/non_score_reward": -3.917667865753174, "objective/rlhf_reward": -14.292069771376948, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 34.836273193359375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6000803709030151, "step": 1585, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9973785877227783 }, { "episode": 25392, "epoch": 0.45641154689578317, "loss/policy_avg": 0.8685510158538818, "lr": 8.986451942740288e-06, "objective/entropy": -445.90032958984375, "objective/kl": 32.107032775878906, "objective/non_score_reward": -3.2107033729553223, "objective/rlhf_reward": -10.895402382092414, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 43.68373107910156, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5711070895195007, "step": 1586, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 1.9977147579193115 }, { "episode": 25408, "epoch": 0.4566991408131718, "loss/policy_avg": 1.2374792098999023, "lr": 8.985812883435585e-06, "objective/entropy": -269.6192626953125, "objective/kl": 40.86695861816406, "objective/non_score_reward": -4.086696147918701, "objective/rlhf_reward": -14.9229529692727, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 4.214241027832031, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5778703689575195, "step": 1587, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9991469383239746 }, { "episode": 25424, "epoch": 0.45698673473056045, "loss/policy_avg": -0.056829970329999924, "lr": 8.98517382413088e-06, "objective/entropy": -411.3571472167969, "objective/kl": 18.695568084716797, "objective/non_score_reward": -1.8695566654205322, "objective/rlhf_reward": -6.099624731627804, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 17.709747314453125, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.5861658453941345, "step": 1588, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 2.0019216537475586 }, { "episode": 25440, "epoch": 0.4572743286479491, "loss/policy_avg": 0.26971131563186646, "lr": 8.984534764826177e-06, "objective/entropy": -419.8878173828125, "objective/kl": 27.937755584716797, "objective/non_score_reward": -2.793775796890259, "objective/rlhf_reward": -8.775103187561035, "objective/scores": 0.6, "policy/approxkl_avg": 0.518993079662323, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7896235585212708, "step": 1589, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 2.0011515617370605 }, { "episode": 25456, "epoch": 0.45756192256533773, "loss/policy_avg": 0.37710195779800415, "lr": 8.983895705521472e-06, "objective/entropy": -405.4755859375, "objective/kl": 35.878684997558594, "objective/non_score_reward": -3.5878684520721436, "objective/rlhf_reward": -12.228767575995002, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 4.3146467208862305, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8733147978782654, "step": 1590, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 2.0001065731048584 }, { "episode": 25472, "epoch": 0.45784951648272637, "loss/policy_avg": 0.2632831037044525, "lr": 8.983256646216769e-06, "objective/entropy": -414.04534912109375, "objective/kl": 30.546842575073242, "objective/non_score_reward": -3.0546841621398926, "objective/rlhf_reward": -10.818737363815309, "objective/scores": 0.35, "policy/approxkl_avg": 48.852638244628906, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7985514402389526, "step": 1591, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 14, "val/ratio": 1.998450517654419 }, { "episode": 25488, "epoch": 0.458137110400115, "loss/policy_avg": 0.6826393604278564, "lr": 8.982617586912066e-06, "objective/entropy": -374.9160461425781, "objective/kl": 21.128250122070312, "objective/non_score_reward": -2.1128249168395996, "objective/rlhf_reward": -7.109664252310424, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 3.942084789276123, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6855432987213135, "step": 1592, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.999008297920227 }, { "episode": 25504, "epoch": 0.4584247043175037, "loss/policy_avg": 0.38613730669021606, "lr": 8.981978527607363e-06, "objective/entropy": -449.263916015625, "objective/kl": 28.967899322509766, "objective/non_score_reward": -2.896790027618408, "objective/rlhf_reward": -7.18716070652008, "objective/scores": 1.1, "policy/approxkl_avg": 26.061811447143555, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6411184072494507, "step": 1593, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9987527132034302 }, { "episode": 25520, "epoch": 0.45871229823489235, "loss/policy_avg": 1.1185591220855713, "lr": 8.98133946830266e-06, "objective/entropy": -430.1148681640625, "objective/kl": 25.81922149658203, "objective/non_score_reward": -2.5819222927093506, "objective/rlhf_reward": -8.502860422405313, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 39.203208923339844, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6846965551376343, "step": 1594, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 1.998095154762268 }, { "episode": 25536, "epoch": 0.458999892152281, "loss/policy_avg": 2.3710274696350098, "lr": 8.980700408997955e-06, "objective/entropy": -407.2514343261719, "objective/kl": 36.31953430175781, "objective/non_score_reward": -3.631953239440918, "objective/rlhf_reward": -12.794480101267496, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 1.217879295349121, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6199887990951538, "step": 1595, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.003922462463379 }, { "episode": 25552, "epoch": 0.4592874860696696, "loss/policy_avg": 0.3695710301399231, "lr": 8.980061349693252e-06, "objective/entropy": -477.0921936035156, "objective/kl": 33.26344299316406, "objective/non_score_reward": -3.3263444900512695, "objective/rlhf_reward": -11.824425104077221, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 0.7949667572975159, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.6483457088470459, "step": 1596, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9995753765106201 }, { "episode": 25568, "epoch": 0.45957507998705827, "loss/policy_avg": -0.23511981964111328, "lr": 8.979422290388549e-06, "objective/entropy": -337.3599853515625, "objective/kl": 40.92012023925781, "objective/non_score_reward": -4.092012405395508, "objective/rlhf_reward": -14.763928446833212, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 30.296527862548828, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.68705153465271, "step": 1597, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9993513822555542 }, { "episode": 25584, "epoch": 0.4598626739044469, "loss/policy_avg": 1.0916402339935303, "lr": 8.978783231083845e-06, "objective/entropy": -359.0259704589844, "objective/kl": 34.8113899230957, "objective/non_score_reward": -3.4811391830444336, "objective/rlhf_reward": -12.599043879538698, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 4.189365863800049, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6944392919540405, "step": 1598, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9979948997497559 }, { "episode": 25600, "epoch": 0.46015026782183555, "loss/policy_avg": -0.46432065963745117, "lr": 8.978144171779142e-06, "objective/entropy": -309.70654296875, "objective/kl": 41.30950164794922, "objective/non_score_reward": -4.130950450897217, "objective/rlhf_reward": -16.523802280426025, "objective/scores": 0.0, "policy/approxkl_avg": 6.908711910247803, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.8937714695930481, "step": 1599, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 2.0006086826324463 }, { "episode": 25616, "epoch": 0.46043786173922424, "loss/policy_avg": 0.4081665277481079, "lr": 8.97750511247444e-06, "objective/entropy": -129.34173583984375, "objective/kl": 41.58888244628906, "objective/non_score_reward": -4.158888816833496, "objective/rlhf_reward": -14.235553836822511, "objective/scores": 0.6, "policy/approxkl_avg": 2.1702589988708496, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.9154573678970337, "step": 1600, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.999558925628662 }, { "episode": 25632, "epoch": 0.4607254556566129, "loss/policy_avg": 0.6155873537063599, "lr": 8.976866053169734e-06, "objective/entropy": -313.52239990234375, "objective/kl": 34.87145233154297, "objective/non_score_reward": -3.487145185470581, "objective/rlhf_reward": -12.39232179423864, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 13.90162467956543, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7837454080581665, "step": 1601, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9977222681045532 }, { "episode": 25648, "epoch": 0.4610130495740015, "loss/policy_avg": 0.49341073632240295, "lr": 8.976226993865031e-06, "objective/entropy": -230.7957763671875, "objective/kl": 40.743282318115234, "objective/non_score_reward": -4.074328422546387, "objective/rlhf_reward": -14.563979641596475, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 21.229673385620117, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7661559581756592, "step": 1602, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.998582363128662 }, { "episode": 25664, "epoch": 0.46130064349139016, "loss/policy_avg": 0.3956504464149475, "lr": 8.975587934560328e-06, "objective/entropy": -335.97332763671875, "objective/kl": 40.16059112548828, "objective/non_score_reward": -4.016058921813965, "objective/rlhf_reward": -14.239407654079507, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 3.294480323791504, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5648142099380493, "step": 1603, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.998520851135254 }, { "episode": 25680, "epoch": 0.4615882374087788, "loss/policy_avg": 0.16773171722888947, "lr": 8.974948875255625e-06, "objective/entropy": -411.7926025390625, "objective/kl": 29.645999908447266, "objective/non_score_reward": -2.964600086212158, "objective/rlhf_reward": -9.458400344848632, "objective/scores": 0.6, "policy/approxkl_avg": 123.7798843383789, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6723917722702026, "step": 1604, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9958925247192383 }, { "episode": 25696, "epoch": 0.46187583132616744, "loss/policy_avg": 0.44738197326660156, "lr": 8.974309815950922e-06, "objective/entropy": -356.6579284667969, "objective/kl": 38.58180618286133, "objective/non_score_reward": -3.8581807613372803, "objective/rlhf_reward": -12.509004031063292, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 8.898035049438477, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6891041994094849, "step": 1605, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.013833999633789 }, { "episode": 25712, "epoch": 0.4621634252435561, "loss/policy_avg": 0.5292553305625916, "lr": 8.973670756646217e-06, "objective/entropy": -358.66217041015625, "objective/kl": 29.649044036865234, "objective/non_score_reward": -2.964904308319092, "objective/rlhf_reward": -10.378665152008892, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 20.77273941040039, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6345843076705933, "step": 1606, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9974372386932373 }, { "episode": 25728, "epoch": 0.4624510191609447, "loss/policy_avg": 0.15684904158115387, "lr": 8.973031697341514e-06, "objective/entropy": -421.99114990234375, "objective/kl": 29.710756301879883, "objective/non_score_reward": -2.9710757732391357, "objective/rlhf_reward": -10.222443585813629, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 21.654502868652344, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5846576690673828, "step": 1607, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9976410865783691 }, { "episode": 25744, "epoch": 0.4627386130783334, "loss/policy_avg": 0.3718082904815674, "lr": 8.97239263803681e-06, "objective/entropy": -365.65020751953125, "objective/kl": 32.760765075683594, "objective/non_score_reward": -3.2760767936706543, "objective/rlhf_reward": -11.778793845206422, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 15.140457153320312, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7173216342926025, "step": 1608, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9998669624328613 }, { "episode": 25760, "epoch": 0.46302620699572206, "loss/policy_avg": 1.6529536247253418, "lr": 8.971753578732108e-06, "objective/entropy": -386.7452087402344, "objective/kl": 42.47675704956055, "objective/non_score_reward": -4.247675895690918, "objective/rlhf_reward": -15.434443204608513, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 5.0789055824279785, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6534456610679626, "step": 1609, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 1.999765396118164 }, { "episode": 25776, "epoch": 0.4633138009131107, "loss/policy_avg": 0.4760410785675049, "lr": 8.971114519427405e-06, "objective/entropy": -380.49420166015625, "objective/kl": 34.94993591308594, "objective/non_score_reward": -3.4949936866760254, "objective/rlhf_reward": -12.529377083392486, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 3.227126121520996, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8223279714584351, "step": 1610, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 18, "val/ratio": 1.9985172748565674 }, { "episode": 25792, "epoch": 0.46360139483049934, "loss/policy_avg": 0.8326834440231323, "lr": 8.970475460122701e-06, "objective/entropy": -419.07684326171875, "objective/kl": 38.1279296875, "objective/non_score_reward": -3.812793254852295, "objective/rlhf_reward": -12.327453766704771, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 8.165489196777344, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7635027170181274, "step": 1611, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.9982054233551025 }, { "episode": 25808, "epoch": 0.463888988747888, "loss/policy_avg": 0.10125212371349335, "lr": 8.969836400817997e-06, "objective/entropy": -386.31097412109375, "objective/kl": 37.03303146362305, "objective/non_score_reward": -3.7033028602600098, "objective/rlhf_reward": -13.43461022624145, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 0.7963939905166626, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.662887692451477, "step": 1612, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 14, "val/ratio": 2.000887393951416 }, { "episode": 25824, "epoch": 0.4641765826652766, "loss/policy_avg": 0.044641196727752686, "lr": 8.969197341513294e-06, "objective/entropy": -374.7705993652344, "objective/kl": 32.699886322021484, "objective/non_score_reward": -3.269988536834717, "objective/rlhf_reward": -13.079954385757446, "objective/scores": 0.0, "policy/approxkl_avg": 0.46452978253364563, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.5399535298347473, "step": 1613, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 2.0002245903015137 }, { "episode": 25840, "epoch": 0.46446417658266526, "loss/policy_avg": 1.6687601804733276, "lr": 8.968558282208589e-06, "objective/entropy": -372.2134094238281, "objective/kl": 42.785064697265625, "objective/non_score_reward": -4.278506278991699, "objective/rlhf_reward": -14.714023685455324, "objective/scores": 0.6, "policy/approxkl_avg": 39.34197235107422, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8211818337440491, "step": 1614, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9990718364715576 }, { "episode": 25856, "epoch": 0.46475177050005395, "loss/policy_avg": -0.13284818828105927, "lr": 8.967919222903886e-06, "objective/entropy": -350.61322021484375, "objective/kl": 33.671878814697266, "objective/non_score_reward": -3.367187976837158, "objective/rlhf_reward": -12.109501683448237, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 0.5211295485496521, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6782361268997192, "step": 1615, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 2.000903606414795 }, { "episode": 25872, "epoch": 0.4650393644174426, "loss/policy_avg": -0.13549593091011047, "lr": 8.967280163599182e-06, "objective/entropy": -383.4450378417969, "objective/kl": 34.53531265258789, "objective/non_score_reward": -3.453530788421631, "objective/rlhf_reward": -11.691417636648687, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 18.708654403686523, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.8225675821304321, "step": 1616, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.99953031539917 }, { "episode": 25888, "epoch": 0.46532695833483123, "loss/policy_avg": 0.1365768015384674, "lr": 8.96664110429448e-06, "objective/entropy": -439.84228515625, "objective/kl": 34.50129699707031, "objective/non_score_reward": -3.450129747390747, "objective/rlhf_reward": -12.47500637534253, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 0.35066476464271545, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6784589290618896, "step": 1617, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.00114107131958 }, { "episode": 25904, "epoch": 0.46561455225221987, "loss/policy_avg": 0.7598201036453247, "lr": 8.966002044989776e-06, "objective/entropy": -449.16802978515625, "objective/kl": 39.740623474121094, "objective/non_score_reward": -3.974062442779541, "objective/rlhf_reward": -13.496250128746034, "objective/scores": 0.6, "policy/approxkl_avg": 13.567388534545898, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6520213484764099, "step": 1618, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 2.0001142024993896 }, { "episode": 25920, "epoch": 0.4659021461696085, "loss/policy_avg": 0.891502857208252, "lr": 8.965362985685071e-06, "objective/entropy": -400.0162353515625, "objective/kl": 42.8731689453125, "objective/non_score_reward": -4.287317276000977, "objective/rlhf_reward": -12.749268150329591, "objective/scores": 1.1, "policy/approxkl_avg": 94.71224975585938, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.681228756904602, "step": 1619, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 2.001039505004883 }, { "episode": 25936, "epoch": 0.46618974008699715, "loss/policy_avg": 0.029704689979553223, "lr": 8.964723926380368e-06, "objective/entropy": -430.4644775390625, "objective/kl": 31.889232635498047, "objective/non_score_reward": -3.1889233589172363, "objective/rlhf_reward": -11.15157321459444, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 4.903954982757568, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5122766494750977, "step": 1620, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9980919361114502 }, { "episode": 25952, "epoch": 0.4664773340043858, "loss/policy_avg": 0.8742834329605103, "lr": 8.964084867075665e-06, "objective/entropy": -404.28045654296875, "objective/kl": 31.80949592590332, "objective/non_score_reward": -3.1809496879577637, "objective/rlhf_reward": -11.364548647139948, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 3.1881070137023926, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7785500288009644, "step": 1621, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 2.000067710876465 }, { "episode": 25968, "epoch": 0.46676492792177443, "loss/policy_avg": 0.7548739314079285, "lr": 8.963445807770962e-06, "objective/entropy": -384.36724853515625, "objective/kl": 47.42879867553711, "objective/non_score_reward": -4.742879867553711, "objective/rlhf_reward": -17.309660439909088, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 57.19119644165039, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7312213182449341, "step": 1622, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.9973558187484741 }, { "episode": 25984, "epoch": 0.4670525218391631, "loss/policy_avg": 0.9261447191238403, "lr": 8.962806748466259e-06, "objective/entropy": -410.57012939453125, "objective/kl": 46.177703857421875, "objective/non_score_reward": -4.617770195007324, "objective/rlhf_reward": -17.11183115217535, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 1.585106372833252, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7124975323677063, "step": 1623, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.002890110015869 }, { "episode": 26000, "epoch": 0.46734011575655177, "loss/policy_avg": -0.03239623084664345, "lr": 8.962167689161556e-06, "objective/entropy": -381.62664794921875, "objective/kl": 34.84297561645508, "objective/non_score_reward": -3.484297275543213, "objective/rlhf_reward": -12.456237199719311, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 0.7469508647918701, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6316899061203003, "step": 1624, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 2.0006675720214844 }, { "episode": 26016, "epoch": 0.4676277096739404, "loss/policy_avg": 1.745779275894165, "lr": 8.961528629856851e-06, "objective/entropy": -156.83627319335938, "objective/kl": 43.46995544433594, "objective/non_score_reward": -4.346995830535889, "objective/rlhf_reward": -15.654649511973062, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 114.80679321289062, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7031311988830566, "step": 1625, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9976086616516113 }, { "episode": 26032, "epoch": 0.46791530359132905, "loss/policy_avg": 0.024723999202251434, "lr": 8.960889570552148e-06, "objective/entropy": -384.3182067871094, "objective/kl": 36.431339263916016, "objective/non_score_reward": -3.6431338787078857, "objective/rlhf_reward": -12.968416009012776, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 3.5224108695983887, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.626000165939331, "step": 1626, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9998475313186646 }, { "episode": 26048, "epoch": 0.4682028975087177, "loss/policy_avg": 0.24203763902187347, "lr": 8.960250511247445e-06, "objective/entropy": -400.3022155761719, "objective/kl": 30.770477294921875, "objective/non_score_reward": -3.077047824859619, "objective/rlhf_reward": -10.704071197573262, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 66.42471313476562, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5930522680282593, "step": 1627, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9991860389709473 }, { "episode": 26064, "epoch": 0.4684904914261063, "loss/policy_avg": -0.7072806358337402, "lr": 8.959611451942742e-06, "objective/entropy": -360.388916015625, "objective/kl": 39.62144470214844, "objective/non_score_reward": -3.962144374847412, "objective/rlhf_reward": -14.367625358517529, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 32.40711212158203, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6506911516189575, "step": 1628, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 2.0159692764282227 }, { "episode": 26080, "epoch": 0.46877808534349497, "loss/policy_avg": 2.6838910579681396, "lr": 8.958972392638038e-06, "objective/entropy": -378.97686767578125, "objective/kl": 36.97150802612305, "objective/non_score_reward": -3.697150945663452, "objective/rlhf_reward": -14.78860354423523, "objective/scores": 0.0, "policy/approxkl_avg": 49.857078552246094, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5898784399032593, "step": 1629, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9984610080718994 }, { "episode": 26096, "epoch": 0.4690656792608836, "loss/policy_avg": 0.28691649436950684, "lr": 8.958333333333334e-06, "objective/entropy": -573.7208251953125, "objective/kl": 19.57809829711914, "objective/non_score_reward": -1.9578100442886353, "objective/rlhf_reward": -6.2749807526737005, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 22.093029022216797, "policy/clipfrac_avg": 0.25, "policy/entropy_avg": 0.5744526982307434, "step": 1630, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.99820876121521 }, { "episode": 26112, "epoch": 0.4693532731782723, "loss/policy_avg": 0.4899155795574188, "lr": 8.95769427402863e-06, "objective/entropy": -414.6656799316406, "objective/kl": 36.486228942871094, "objective/non_score_reward": -3.648622989654541, "objective/rlhf_reward": -13.078720176013645, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 6.530874729156494, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7301163077354431, "step": 1631, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.9970190525054932 }, { "episode": 26128, "epoch": 0.46964086709566094, "loss/policy_avg": -0.06641054153442383, "lr": 8.957055214723927e-06, "objective/entropy": -440.5696716308594, "objective/kl": 38.2086181640625, "objective/non_score_reward": -3.820862054824829, "objective/rlhf_reward": -13.80249512475288, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 3.76316499710083, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7395534515380859, "step": 1632, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 2.0009098052978516 }, { "episode": 26144, "epoch": 0.4699284610130496, "loss/policy_avg": 1.6136388778686523, "lr": 8.956416155419224e-06, "objective/entropy": -375.7001953125, "objective/kl": 36.05951690673828, "objective/non_score_reward": -3.6059517860412598, "objective/rlhf_reward": -12.598978514942239, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 4.612539768218994, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6019783616065979, "step": 1633, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.0007171630859375 }, { "episode": 26160, "epoch": 0.4702160549304382, "loss/policy_avg": 3.5966718196868896, "lr": 8.95577709611452e-06, "objective/entropy": -343.9623107910156, "objective/kl": 39.71470642089844, "objective/non_score_reward": -3.971470594406128, "objective/rlhf_reward": -15.88588297367096, "objective/scores": 0.0, "policy/approxkl_avg": 9.776708602905273, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6924200654029846, "step": 1634, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 2.000486135482788 }, { "episode": 26176, "epoch": 0.47050364884782686, "loss/policy_avg": 0.9908069372177124, "lr": 8.955138036809816e-06, "objective/entropy": -456.3717956542969, "objective/kl": 38.72023391723633, "objective/non_score_reward": -3.872023820877075, "objective/rlhf_reward": -15.488094568252563, "objective/scores": 0.0, "policy/approxkl_avg": 4.954434871673584, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.659642219543457, "step": 1635, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.997072696685791 }, { "episode": 26192, "epoch": 0.4707912427652155, "loss/policy_avg": 0.03370996564626694, "lr": 8.954498977505113e-06, "objective/entropy": -268.5799560546875, "objective/kl": 32.134056091308594, "objective/non_score_reward": -3.2134053707122803, "objective/rlhf_reward": -10.906210730748112, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 2.953232526779175, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5509767532348633, "step": 1636, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9998369216918945 }, { "episode": 26208, "epoch": 0.47107883668260414, "loss/policy_avg": 0.7738806009292603, "lr": 8.95385991820041e-06, "objective/entropy": -482.381591796875, "objective/kl": 30.14999008178711, "objective/non_score_reward": -3.0149989128112793, "objective/rlhf_reward": -7.659996247291565, "objective/scores": 1.1, "policy/approxkl_avg": 2.2431159019470215, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.507556140422821, "step": 1637, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.9998831748962402 }, { "episode": 26224, "epoch": 0.47136643059999284, "loss/policy_avg": 0.8606960773468018, "lr": 8.953220858895705e-06, "objective/entropy": -383.1812744140625, "objective/kl": 32.99481201171875, "objective/non_score_reward": -3.2994816303253174, "objective/rlhf_reward": -11.856290867834716, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 7.782444953918457, "policy/clipfrac_avg": 0.25, "policy/entropy_avg": 0.596083402633667, "step": 1638, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 2.0005035400390625 }, { "episode": 26240, "epoch": 0.4716540245173815, "loss/policy_avg": -0.1339363008737564, "lr": 8.952581799591002e-06, "objective/entropy": -417.17938232421875, "objective/kl": 35.73338317871094, "objective/non_score_reward": -3.573338508605957, "objective/rlhf_reward": -12.89335379600525, "objective/scores": 0.35, "policy/approxkl_avg": 9.185393333435059, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.7601564526557922, "step": 1639, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 2.001448631286621 }, { "episode": 26256, "epoch": 0.4719416184347701, "loss/policy_avg": 3.864933967590332, "lr": 8.951942740286299e-06, "objective/entropy": -427.02362060546875, "objective/kl": 24.26729965209961, "objective/non_score_reward": -2.426729679107666, "objective/rlhf_reward": -8.381406579047365, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 2.5686416625976562, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.603689968585968, "step": 1640, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.0048418045043945 }, { "episode": 26272, "epoch": 0.47222921235215876, "loss/policy_avg": 0.4509657025337219, "lr": 8.951303680981596e-06, "objective/entropy": -406.6365966796875, "objective/kl": 43.62321472167969, "objective/non_score_reward": -4.362321853637695, "objective/rlhf_reward": -15.501877139286933, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 19.274425506591797, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5924656987190247, "step": 1641, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9960191249847412 }, { "episode": 26288, "epoch": 0.4725168062695474, "loss/policy_avg": 0.46149808168411255, "lr": 8.950664621676893e-06, "objective/entropy": -433.11029052734375, "objective/kl": 33.15770721435547, "objective/non_score_reward": -3.3157711029052734, "objective/rlhf_reward": -11.706825106349541, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 4.765190124511719, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6916393041610718, "step": 1642, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9986909627914429 }, { "episode": 26304, "epoch": 0.47280440018693604, "loss/policy_avg": 0.44238945841789246, "lr": 8.950025562372188e-06, "objective/entropy": -349.730224609375, "objective/kl": 30.80122184753418, "objective/non_score_reward": -3.080122470855713, "objective/rlhf_reward": -10.658630614698517, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 4.498382091522217, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.4885851740837097, "step": 1643, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9990676641464233 }, { "episode": 26320, "epoch": 0.4730919941043247, "loss/policy_avg": 2.4708251953125, "lr": 8.949386503067485e-06, "objective/entropy": -404.8670654296875, "objective/kl": 41.547725677490234, "objective/non_score_reward": -4.1547722816467285, "objective/rlhf_reward": -16.619089722633362, "objective/scores": 0.0, "policy/approxkl_avg": 2.221585988998413, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6881966590881348, "step": 1644, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 2.000181198120117 }, { "episode": 26336, "epoch": 0.4733795880217133, "loss/policy_avg": -0.11322785913944244, "lr": 8.948747443762782e-06, "objective/entropy": -399.2702941894531, "objective/kl": 40.40675735473633, "objective/non_score_reward": -4.040676116943359, "objective/rlhf_reward": -14.784101822463374, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 1.1996610164642334, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5444244146347046, "step": 1645, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.001070976257324 }, { "episode": 26352, "epoch": 0.473667181939102, "loss/policy_avg": -0.2234870344400406, "lr": 8.948108384458079e-06, "objective/entropy": -434.2123718261719, "objective/kl": 34.17063522338867, "objective/non_score_reward": -3.4170637130737305, "objective/rlhf_reward": -11.545548977629217, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 1.8733255863189697, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5498039722442627, "step": 1646, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9997615814208984 }, { "episode": 26368, "epoch": 0.47395477585649065, "loss/policy_avg": 0.8130428791046143, "lr": 8.947469325153376e-06, "objective/entropy": -398.3437805175781, "objective/kl": 31.40152359008789, "objective/non_score_reward": -3.1401524543762207, "objective/rlhf_reward": -11.235096726447267, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 1.1398074626922607, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6039716601371765, "step": 1647, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.0010716915130615 }, { "episode": 26384, "epoch": 0.4742423697738793, "loss/policy_avg": -0.01273877453058958, "lr": 8.946830265848672e-06, "objective/entropy": -417.34912109375, "objective/kl": 32.46980285644531, "objective/non_score_reward": -3.2469801902770996, "objective/rlhf_reward": -11.506968858654856, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 6.277264595031738, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6654943823814392, "step": 1648, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 1.9999809265136719 }, { "episode": 26400, "epoch": 0.47452996369126793, "loss/policy_avg": -0.3940402865409851, "lr": 8.946191206543968e-06, "objective/entropy": -430.748046875, "objective/kl": 33.73677062988281, "objective/non_score_reward": -3.3736772537231445, "objective/rlhf_reward": -10.570989881397459, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 3.314667224884033, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6480484008789062, "step": 1649, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.0017786026000977 }, { "episode": 26416, "epoch": 0.47481755760865657, "loss/policy_avg": -0.2192322015762329, "lr": 8.945552147239264e-06, "objective/entropy": -383.8127136230469, "objective/kl": 41.51525115966797, "objective/non_score_reward": -4.151525020599365, "objective/rlhf_reward": -12.206099963188173, "objective/scores": 1.1, "policy/approxkl_avg": 4.288912773132324, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6321976780891418, "step": 1650, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 2.0026891231536865 }, { "episode": 26432, "epoch": 0.4751051515260452, "loss/policy_avg": -0.1348821520805359, "lr": 8.944913087934561e-06, "objective/entropy": -409.4119873046875, "objective/kl": 35.97928237915039, "objective/non_score_reward": -3.597928524017334, "objective/rlhf_reward": -12.967881996830073, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 3.7316250801086426, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6831116676330566, "step": 1651, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.001095771789551 }, { "episode": 26448, "epoch": 0.47539274544343385, "loss/policy_avg": 0.31589481234550476, "lr": 8.944274028629858e-06, "objective/entropy": -399.6732482910156, "objective/kl": 40.63943862915039, "objective/non_score_reward": -4.063943862915039, "objective/rlhf_reward": -16.25577664375305, "objective/scores": 0.0, "policy/approxkl_avg": 6.397206783294678, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5949341058731079, "step": 1652, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.9979043006896973 }, { "episode": 26464, "epoch": 0.47568033936082255, "loss/policy_avg": 0.9684159755706787, "lr": 8.943634969325155e-06, "objective/entropy": -416.2860412597656, "objective/kl": 41.30329132080078, "objective/non_score_reward": -4.130329132080078, "objective/rlhf_reward": -15.142714359847407, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 7.341388702392578, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.6481591463088989, "step": 1653, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.999350666999817 }, { "episode": 26480, "epoch": 0.4759679332782112, "loss/policy_avg": 0.6937155723571777, "lr": 8.94299591002045e-06, "objective/entropy": -392.06768798828125, "objective/kl": 47.32278060913086, "objective/non_score_reward": -4.732278347015381, "objective/rlhf_reward": -14.529113388061525, "objective/scores": 1.1, "policy/approxkl_avg": 55.0672492980957, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7007068395614624, "step": 1654, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 14, "val/ratio": 1.9982314109802246 }, { "episode": 26496, "epoch": 0.4762555271955998, "loss/policy_avg": 0.015181057155132294, "lr": 8.942356850715747e-06, "objective/entropy": -280.1114501953125, "objective/kl": 35.78889083862305, "objective/non_score_reward": -3.5788888931274414, "objective/rlhf_reward": -12.368144820408757, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 1.2872930765151978, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6389068961143494, "step": 1655, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9995758533477783 }, { "episode": 26512, "epoch": 0.47654312111298847, "loss/policy_avg": 0.37557125091552734, "lr": 8.941717791411042e-06, "objective/entropy": -435.42230224609375, "objective/kl": 40.69313049316406, "objective/non_score_reward": -4.069313049316406, "objective/rlhf_reward": -14.329841445164618, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 5.111576080322266, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6841574907302856, "step": 1656, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 14, "val/ratio": 1.9989839792251587 }, { "episode": 26528, "epoch": 0.4768307150303771, "loss/policy_avg": 0.8275701999664307, "lr": 8.94107873210634e-06, "objective/entropy": -368.5299987792969, "objective/kl": 38.08189392089844, "objective/non_score_reward": -3.808189630508423, "objective/rlhf_reward": -12.832758045196535, "objective/scores": 0.6, "policy/approxkl_avg": 11.50629997253418, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7231966257095337, "step": 1657, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9985672235488892 }, { "episode": 26544, "epoch": 0.47711830894776575, "loss/policy_avg": 0.1858597993850708, "lr": 8.940439672801636e-06, "objective/entropy": -468.83892822265625, "objective/kl": 32.060089111328125, "objective/non_score_reward": -3.2060089111328125, "objective/rlhf_reward": -11.219915542666037, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 18.871475219726562, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.7692112326622009, "step": 1658, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 16, "val/ratio": 1.9997926950454712 }, { "episode": 26560, "epoch": 0.4774059028651544, "loss/policy_avg": -0.7411110401153564, "lr": 8.939800613496933e-06, "objective/entropy": -426.9635009765625, "objective/kl": 35.84244918823242, "objective/non_score_reward": -3.584244966506958, "objective/rlhf_reward": -12.675120120466339, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 9.95809268951416, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.6828428506851196, "step": 1659, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 2.004059314727783 }, { "episode": 26576, "epoch": 0.477693496782543, "loss/policy_avg": -0.4879923462867737, "lr": 8.93916155419223e-06, "objective/entropy": -439.90228271484375, "objective/kl": 36.388763427734375, "objective/non_score_reward": -3.638876438140869, "objective/rlhf_reward": -12.155505514144899, "objective/scores": 0.6, "policy/approxkl_avg": 6.3313446044921875, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.6424757242202759, "step": 1660, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 2.0021562576293945 }, { "episode": 26592, "epoch": 0.4779810906999317, "loss/policy_avg": 1.116358995437622, "lr": 8.938522494887527e-06, "objective/entropy": -446.7737731933594, "objective/kl": 44.409584045410156, "objective/non_score_reward": -4.440958499908447, "objective/rlhf_reward": -16.313235740275726, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 14.082149505615234, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6858080625534058, "step": 1661, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9980788230895996 }, { "episode": 26608, "epoch": 0.47826868461732036, "loss/policy_avg": 0.5042970180511475, "lr": 8.937883435582822e-06, "objective/entropy": -422.8186340332031, "objective/kl": 32.531585693359375, "objective/non_score_reward": -3.2531585693359375, "objective/rlhf_reward": -11.634032585708002, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 3.2845988273620605, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5514823198318481, "step": 1662, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9991204738616943 }, { "episode": 26624, "epoch": 0.478556278534709, "loss/policy_avg": 0.09836432337760925, "lr": 8.937244376278119e-06, "objective/entropy": -376.92218017578125, "objective/kl": 28.976757049560547, "objective/non_score_reward": -2.8976755142211914, "objective/rlhf_reward": -9.928843384206878, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 1.142619013786316, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6947767734527588, "step": 1663, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.000258207321167 }, { "episode": 26640, "epoch": 0.47884387245209764, "loss/policy_avg": -0.6851586103439331, "lr": 8.936605316973416e-06, "objective/entropy": -343.80810546875, "objective/kl": 34.99475860595703, "objective/non_score_reward": -3.4994759559631348, "objective/rlhf_reward": -11.875197353140386, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 4.260616302490234, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5976920127868652, "step": 1664, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 2.003633499145508 }, { "episode": 26656, "epoch": 0.4791314663694863, "loss/policy_avg": 0.6990913152694702, "lr": 8.935966257668713e-06, "objective/entropy": -386.11004638671875, "objective/kl": 35.65720748901367, "objective/non_score_reward": -3.565720796585083, "objective/rlhf_reward": -12.315471957402167, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 2.7904255390167236, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4942004680633545, "step": 1665, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9990780353546143 }, { "episode": 26672, "epoch": 0.4794190602868749, "loss/policy_avg": 0.25033342838287354, "lr": 8.93532719836401e-06, "objective/entropy": -388.11590576171875, "objective/kl": 36.51459503173828, "objective/non_score_reward": -3.651459217071533, "objective/rlhf_reward": -13.124884965832592, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 6.214205265045166, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5733066201210022, "step": 1666, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9993984699249268 }, { "episode": 26688, "epoch": 0.47970665420426356, "loss/policy_avg": 0.8446806073188782, "lr": 8.934688139059305e-06, "objective/entropy": -435.40191650390625, "objective/kl": 40.17815399169922, "objective/non_score_reward": -4.017815589904785, "objective/rlhf_reward": -14.745748553305788, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 6.942691802978516, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6258660554885864, "step": 1667, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9993536472320557 }, { "episode": 26704, "epoch": 0.4799942481216522, "loss/policy_avg": 1.5240944623947144, "lr": 8.934049079754602e-06, "objective/entropy": -412.3753662109375, "objective/kl": 33.70470428466797, "objective/non_score_reward": -3.3704700469970703, "objective/rlhf_reward": -13.481880903244019, "objective/scores": 0.0, "policy/approxkl_avg": 30.208845138549805, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6115098595619202, "step": 1668, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9997292757034302 }, { "episode": 26720, "epoch": 0.4802818420390409, "loss/policy_avg": 0.12263950705528259, "lr": 8.933410020449898e-06, "objective/entropy": -448.531982421875, "objective/kl": 36.315582275390625, "objective/non_score_reward": -3.631558418273926, "objective/rlhf_reward": -12.922113452021199, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 4.192168235778809, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6477067470550537, "step": 1669, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9979054927825928 }, { "episode": 26736, "epoch": 0.48056943595642954, "loss/policy_avg": 0.29384303092956543, "lr": 8.932770961145195e-06, "objective/entropy": -411.04254150390625, "objective/kl": 48.54298400878906, "objective/non_score_reward": -4.854298114776611, "objective/rlhf_reward": -15.017191982269289, "objective/scores": 1.1, "policy/approxkl_avg": 6.444131851196289, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6500333547592163, "step": 1670, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.0007288455963135 }, { "episode": 26752, "epoch": 0.4808570298738182, "loss/policy_avg": 0.861343502998352, "lr": 8.932131901840492e-06, "objective/entropy": -393.406982421875, "objective/kl": 32.83056640625, "objective/non_score_reward": -3.2830562591552734, "objective/rlhf_reward": -11.790589859991698, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 30.80994987487793, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7001285552978516, "step": 1671, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9988535642623901 }, { "episode": 26768, "epoch": 0.4811446237912068, "loss/policy_avg": 0.2407083511352539, "lr": 8.931492842535789e-06, "objective/entropy": -356.23590087890625, "objective/kl": 39.6117057800293, "objective/non_score_reward": -3.9611706733703613, "objective/rlhf_reward": -13.444682216644289, "objective/scores": 0.6, "policy/approxkl_avg": 2.4046785831451416, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7408854961395264, "step": 1672, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 1.9999974966049194 }, { "episode": 26784, "epoch": 0.48143221770859546, "loss/policy_avg": 0.18417704105377197, "lr": 8.930853783231084e-06, "objective/entropy": -461.93011474609375, "objective/kl": 44.24816131591797, "objective/non_score_reward": -4.424816131591797, "objective/rlhf_reward": -16.37375167372815, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 2.5959041118621826, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6895890235900879, "step": 1673, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9999454021453857 }, { "episode": 26800, "epoch": 0.4817198116259841, "loss/policy_avg": 0.5731082558631897, "lr": 8.930214723926381e-06, "objective/entropy": -417.73431396484375, "objective/kl": 46.00732421875, "objective/non_score_reward": -4.600732803344727, "objective/rlhf_reward": -15.479212675930235, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 0.7374999523162842, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6999803781509399, "step": 1674, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9998290538787842 }, { "episode": 26816, "epoch": 0.48200740554337274, "loss/policy_avg": -0.5741149187088013, "lr": 8.929575664621678e-06, "objective/entropy": -441.59649658203125, "objective/kl": 33.28999328613281, "objective/non_score_reward": -3.3289995193481445, "objective/rlhf_reward": -11.8002258179509, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 2.020813465118408, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5520445108413696, "step": 1675, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 2.00361967086792 }, { "episode": 26832, "epoch": 0.48229499946076143, "loss/policy_avg": 0.22925391793251038, "lr": 8.928936605316975e-06, "objective/entropy": -370.97686767578125, "objective/kl": 32.112850189208984, "objective/non_score_reward": -3.21128511428833, "objective/rlhf_reward": -11.421308119495478, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 11.470270156860352, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6254950761795044, "step": 1676, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 2.0004844665527344 }, { "episode": 26848, "epoch": 0.48258259337815007, "loss/policy_avg": 1.4506404399871826, "lr": 8.928297546012272e-06, "objective/entropy": -392.0498046875, "objective/kl": 37.194297790527344, "objective/non_score_reward": -3.7194297313690186, "objective/rlhf_reward": -12.477719402313234, "objective/scores": 0.6, "policy/approxkl_avg": 3.003965139389038, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5682520866394043, "step": 1677, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9981834888458252 }, { "episode": 26864, "epoch": 0.4828701872955387, "loss/policy_avg": 1.2458202838897705, "lr": 8.927658486707567e-06, "objective/entropy": -311.18060302734375, "objective/kl": 39.92573547363281, "objective/non_score_reward": -3.9925734996795654, "objective/rlhf_reward": -14.611043894027155, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 38.68024444580078, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6542336344718933, "step": 1678, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9966943264007568 }, { "episode": 26880, "epoch": 0.48315778121292735, "loss/policy_avg": 0.5574089288711548, "lr": 8.927019427402864e-06, "objective/entropy": -396.7320861816406, "objective/kl": 27.356082916259766, "objective/non_score_reward": -2.7356083393096924, "objective/rlhf_reward": -9.600797822981505, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 7.95252799987793, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6256705522537231, "step": 1679, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9978477954864502 }, { "episode": 26896, "epoch": 0.483445375130316, "loss/policy_avg": -0.2597196400165558, "lr": 8.926380368098159e-06, "objective/entropy": -416.4184875488281, "objective/kl": 39.364662170410156, "objective/non_score_reward": -3.9364664554595947, "objective/rlhf_reward": -14.084006314695465, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 12.438138961791992, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7305759191513062, "step": 1680, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 2.000302314758301 }, { "episode": 26912, "epoch": 0.48373296904770463, "loss/policy_avg": 0.8952155113220215, "lr": 8.925741308793456e-06, "objective/entropy": -461.25653076171875, "objective/kl": 35.6180419921875, "objective/non_score_reward": -3.5618045330047607, "objective/rlhf_reward": -12.766265752728344, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 7.684481620788574, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7476506233215332, "step": 1681, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.998413324356079 }, { "episode": 26928, "epoch": 0.48402056296509327, "loss/policy_avg": 0.6283873319625854, "lr": 8.925102249488753e-06, "objective/entropy": -428.31463623046875, "objective/kl": 43.77210998535156, "objective/non_score_reward": -4.37721061706543, "objective/rlhf_reward": -15.952584355083061, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 2.92600154876709, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.591777503490448, "step": 1682, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9992499351501465 }, { "episode": 26944, "epoch": 0.4843081568824819, "loss/policy_avg": 0.6489890813827515, "lr": 8.92446319018405e-06, "objective/entropy": -413.67279052734375, "objective/kl": 36.5638427734375, "objective/non_score_reward": -3.6563844680786133, "objective/rlhf_reward": -12.225537157058717, "objective/scores": 0.6, "policy/approxkl_avg": 3.818236827850342, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6381638646125793, "step": 1683, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9984195232391357 }, { "episode": 26960, "epoch": 0.4845957507998706, "loss/policy_avg": 4.54330587387085, "lr": 8.923824130879346e-06, "objective/entropy": -272.0236511230469, "objective/kl": 41.632476806640625, "objective/non_score_reward": -4.163247585296631, "objective/rlhf_reward": -15.22915848036584, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 3.9071102142333984, "policy/clipfrac_avg": 0.25, "policy/entropy_avg": 0.5447160005569458, "step": 1684, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0005366802215576 }, { "episode": 26976, "epoch": 0.48488334471725925, "loss/policy_avg": 0.0395740270614624, "lr": 8.923185071574643e-06, "objective/entropy": -419.14031982421875, "objective/kl": 31.555559158325195, "objective/non_score_reward": -3.1555559635162354, "objective/rlhf_reward": -11.262973749373836, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 10.734424591064453, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.685627818107605, "step": 1685, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.0008254051208496 }, { "episode": 26992, "epoch": 0.4851709386346479, "loss/policy_avg": 0.8629701137542725, "lr": 8.922546012269939e-06, "objective/entropy": -376.2289123535156, "objective/kl": 45.655975341796875, "objective/non_score_reward": -4.5655975341796875, "objective/rlhf_reward": -15.862389421463014, "objective/scores": 0.6, "policy/approxkl_avg": 21.249679565429688, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7637636661529541, "step": 1686, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 15, "val/ratio": 1.9983651638031006 }, { "episode": 27008, "epoch": 0.4854585325520365, "loss/policy_avg": 1.6487116813659668, "lr": 8.921906952965235e-06, "objective/entropy": -407.13372802734375, "objective/kl": 21.96820831298828, "objective/non_score_reward": -2.1968207359313965, "objective/rlhf_reward": -6.839871833996709, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 3.874408721923828, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.8231643438339233, "step": 1687, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 15, "val/ratio": 2.0046801567077637 }, { "episode": 27024, "epoch": 0.48574612646942517, "loss/policy_avg": -0.12748177349567413, "lr": 8.921267893660532e-06, "objective/entropy": -448.079345703125, "objective/kl": 32.80928039550781, "objective/non_score_reward": -3.280928134918213, "objective/rlhf_reward": -11.78207629015985, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 13.005249977111816, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5742599964141846, "step": 1688, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 2.0072314739227295 }, { "episode": 27040, "epoch": 0.4860337203868138, "loss/policy_avg": 0.17407271265983582, "lr": 8.92062883435583e-06, "objective/entropy": -420.4717712402344, "objective/kl": 20.723045349121094, "objective/non_score_reward": -2.0723047256469727, "objective/rlhf_reward": -8.289219051599503, "objective/scores": 0.0, "policy/approxkl_avg": 1.855107069015503, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.8479450941085815, "step": 1689, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 18, "val/ratio": 2.0018362998962402 }, { "episode": 27056, "epoch": 0.48632131430420245, "loss/policy_avg": 0.41411519050598145, "lr": 8.919989775051126e-06, "objective/entropy": -426.39788818359375, "objective/kl": 43.23339080810547, "objective/non_score_reward": -4.323339462280273, "objective/rlhf_reward": -15.914754250136713, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 5.036855697631836, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7080724239349365, "step": 1690, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 2.000317096710205 }, { "episode": 27072, "epoch": 0.48660890822159114, "loss/policy_avg": 1.2218260765075684, "lr": 8.919350715746421e-06, "objective/entropy": -419.58160400390625, "objective/kl": 27.427894592285156, "objective/non_score_reward": -2.7427897453308105, "objective/rlhf_reward": -9.5205604835466, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 5.919201850891113, "policy/clipfrac_avg": 0.25, "policy/entropy_avg": 0.6099730730056763, "step": 1691, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 1.9993302822113037 }, { "episode": 27088, "epoch": 0.4868965021389798, "loss/policy_avg": 0.2703090310096741, "lr": 8.918711656441718e-06, "objective/entropy": -399.06488037109375, "objective/kl": 43.026756286621094, "objective/non_score_reward": -4.302675724029541, "objective/rlhf_reward": -15.606582436625082, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 2.5643582344055176, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.48464542627334595, "step": 1692, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9990031719207764 }, { "episode": 27104, "epoch": 0.4871840960563684, "loss/policy_avg": -0.34561437368392944, "lr": 8.918072597137015e-06, "objective/entropy": -359.4370422363281, "objective/kl": 37.78188705444336, "objective/non_score_reward": -3.778188705444336, "objective/rlhf_reward": -13.287926073345254, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 4.005827903747559, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5592657327651978, "step": 1693, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 2.0002658367156982 }, { "episode": 27120, "epoch": 0.48747168997375706, "loss/policy_avg": 1.6910521984100342, "lr": 8.917433537832312e-06, "objective/entropy": -308.5474853515625, "objective/kl": 42.24267578125, "objective/non_score_reward": -4.224267959594727, "objective/rlhf_reward": -15.416118028576733, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 1.8454639911651611, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.721234917640686, "step": 1694, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 20, "val/ratio": 1.9995944499969482 }, { "episode": 27136, "epoch": 0.4877592838911457, "loss/policy_avg": -0.43421345949172974, "lr": 8.916794478527609e-06, "objective/entropy": -388.4516296386719, "objective/kl": 35.90379333496094, "objective/non_score_reward": -3.590379238128662, "objective/rlhf_reward": -13.00226708624212, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 1.91756272315979, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.637759804725647, "step": 1695, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 2.0017752647399902 }, { "episode": 27152, "epoch": 0.48804687780853434, "loss/policy_avg": 0.13800783455371857, "lr": 8.916155419222906e-06, "objective/entropy": -386.30670166015625, "objective/kl": 44.80225372314453, "objective/non_score_reward": -4.480225563049316, "objective/rlhf_reward": -15.520902729034425, "objective/scores": 0.6, "policy/approxkl_avg": 17.814830780029297, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6229501962661743, "step": 1696, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9979759454727173 }, { "episode": 27168, "epoch": 0.488334471725923, "loss/policy_avg": -0.6147864460945129, "lr": 8.9155163599182e-06, "objective/entropy": -359.4217224121094, "objective/kl": 40.8199348449707, "objective/non_score_reward": -4.081993579864502, "objective/rlhf_reward": -13.40425482833502, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 22.895889282226562, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7785426378250122, "step": 1697, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 2.001692533493042 }, { "episode": 27184, "epoch": 0.4886220656433116, "loss/policy_avg": 0.3894260823726654, "lr": 8.914877300613498e-06, "objective/entropy": -448.70770263671875, "objective/kl": 38.545040130615234, "objective/non_score_reward": -3.854504108428955, "objective/rlhf_reward": -13.593187446865151, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 7.749185562133789, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6764988899230957, "step": 1698, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.99898099899292 }, { "episode": 27200, "epoch": 0.4889096595607003, "loss/policy_avg": 0.20305383205413818, "lr": 8.914238241308795e-06, "objective/entropy": -399.7395935058594, "objective/kl": 40.19321060180664, "objective/non_score_reward": -4.019320964813232, "objective/rlhf_reward": -13.677283620834352, "objective/scores": 0.6, "policy/approxkl_avg": 11.945358276367188, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6444295644760132, "step": 1699, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 2.007512092590332 }, { "episode": 27216, "epoch": 0.48919725347808896, "loss/policy_avg": -0.2847161889076233, "lr": 8.913599182004091e-06, "objective/entropy": -436.4180908203125, "objective/kl": 17.534095764160156, "objective/non_score_reward": -1.753409504890442, "objective/rlhf_reward": -7.013638257980347, "objective/scores": 0.0, "policy/approxkl_avg": 9.336834907531738, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.609667181968689, "step": 1700, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.002042770385742 }, { "episode": 27232, "epoch": 0.4894848473954776, "loss/policy_avg": 0.4144839644432068, "lr": 8.912960122699387e-06, "objective/entropy": -390.92193603515625, "objective/kl": 39.59233093261719, "objective/non_score_reward": -3.959233283996582, "objective/rlhf_reward": -14.280674307551934, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 26.25909423828125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5924740433692932, "step": 1701, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.997622013092041 }, { "episode": 27248, "epoch": 0.48977244131286624, "loss/policy_avg": 0.8134864568710327, "lr": 8.912321063394684e-06, "objective/entropy": -385.3017578125, "objective/kl": 29.605857849121094, "objective/non_score_reward": -2.9605860710144043, "objective/rlhf_reward": -10.286084501948908, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 7.348019599914551, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7487697005271912, "step": 1702, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.998684287071228 }, { "episode": 27264, "epoch": 0.4900600352302549, "loss/policy_avg": 0.5296938419342041, "lr": 8.91168200408998e-06, "objective/entropy": -313.36285400390625, "objective/kl": 41.14372253417969, "objective/non_score_reward": -4.114372253417969, "objective/rlhf_reward": -16.457489490509033, "objective/scores": 0.0, "policy/approxkl_avg": 15.4187650680542, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7132809162139893, "step": 1703, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9990894794464111 }, { "episode": 27280, "epoch": 0.4903476291476435, "loss/policy_avg": 0.09059986472129822, "lr": 8.911042944785276e-06, "objective/entropy": -409.43841552734375, "objective/kl": 32.81865692138672, "objective/non_score_reward": -3.2818660736083984, "objective/rlhf_reward": -11.611692392619783, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 26.125198364257812, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6496701836585999, "step": 1704, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9995701313018799 }, { "episode": 27296, "epoch": 0.49063522306503216, "loss/policy_avg": 0.7606472969055176, "lr": 8.910403885480572e-06, "objective/entropy": -91.58116912841797, "objective/kl": 62.45122528076172, "objective/non_score_reward": -6.245121955871582, "objective/rlhf_reward": -23.376368317667563, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 4.585382461547852, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7571431398391724, "step": 1705, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.0007095336914062 }, { "episode": 27312, "epoch": 0.4909228169824208, "loss/policy_avg": 2.4944257736206055, "lr": 8.90976482617587e-06, "objective/entropy": -296.96246337890625, "objective/kl": 45.249107360839844, "objective/non_score_reward": -4.524910926818848, "objective/rlhf_reward": -16.618691089566113, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 15.490480422973633, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5693103671073914, "step": 1706, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 14, "val/ratio": 2.000034809112549 }, { "episode": 27328, "epoch": 0.4912104108998095, "loss/policy_avg": 0.23679713904857635, "lr": 8.909125766871166e-06, "objective/entropy": -425.3541259765625, "objective/kl": 34.24750900268555, "objective/non_score_reward": -3.4247512817382812, "objective/rlhf_reward": -12.373491559058351, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 1.5208916664123535, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6073684692382812, "step": 1707, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9997342824935913 }, { "episode": 27344, "epoch": 0.49149800481719813, "loss/policy_avg": 0.024635761976242065, "lr": 8.908486707566463e-06, "objective/entropy": -400.0603332519531, "objective/kl": 41.86968231201172, "objective/non_score_reward": -4.1869683265686035, "objective/rlhf_reward": -15.388623440001886, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 35.488216400146484, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7808965444564819, "step": 1708, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9996318817138672 }, { "episode": 27360, "epoch": 0.49178559873458677, "loss/policy_avg": -0.18475180864334106, "lr": 8.90784764826176e-06, "objective/entropy": -413.5724182128906, "objective/kl": 35.16362762451172, "objective/non_score_reward": -3.5163626670837402, "objective/rlhf_reward": -12.332117096583048, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 1.2768431901931763, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.7634267807006836, "step": 1709, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 2.003754138946533 }, { "episode": 27376, "epoch": 0.4920731926519754, "loss/policy_avg": 0.38974809646606445, "lr": 8.907208588957055e-06, "objective/entropy": -411.81048583984375, "objective/kl": 37.39848327636719, "objective/non_score_reward": -3.7398486137390137, "objective/rlhf_reward": -13.29753530544101, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 0.7992995977401733, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7226402759552002, "step": 1710, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 2.0025315284729004 }, { "episode": 27392, "epoch": 0.49236078656936405, "loss/policy_avg": -0.8558605909347534, "lr": 8.906569529652352e-06, "objective/entropy": -336.47479248046875, "objective/kl": 40.11391830444336, "objective/non_score_reward": -4.011392116546631, "objective/rlhf_reward": -13.645567989349367, "objective/scores": 0.6, "policy/approxkl_avg": 57.88031005859375, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7028759121894836, "step": 1711, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 2.002833366394043 }, { "episode": 27408, "epoch": 0.4926483804867527, "loss/policy_avg": 0.3033882677555084, "lr": 8.905930470347649e-06, "objective/entropy": -312.1128234863281, "objective/kl": 33.98137664794922, "objective/non_score_reward": -3.3981380462646484, "objective/rlhf_reward": -12.267038974791689, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 1.868586778640747, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7824615240097046, "step": 1712, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.007772207260132 }, { "episode": 27424, "epoch": 0.49293597440414133, "loss/policy_avg": 1.3246402740478516, "lr": 8.905291411042946e-06, "objective/entropy": -366.8450622558594, "objective/kl": 28.91984748840332, "objective/non_score_reward": -2.8919849395751953, "objective/rlhf_reward": -9.620528052525458, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 21.229454040527344, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.551103949546814, "step": 1713, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.998268961906433 }, { "episode": 27440, "epoch": 0.49322356832153, "loss/policy_avg": -0.3467206358909607, "lr": 8.904652351738243e-06, "objective/entropy": -383.07708740234375, "objective/kl": 43.419219970703125, "objective/non_score_reward": -4.341921806335449, "objective/rlhf_reward": -14.443969879986021, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 1.0975329875946045, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.4689633846282959, "step": 1714, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 2.0008020401000977 }, { "episode": 27456, "epoch": 0.49351116223891867, "loss/policy_avg": 0.330936074256897, "lr": 8.90401329243354e-06, "objective/entropy": -379.84844970703125, "objective/kl": 43.714324951171875, "objective/non_score_reward": -4.371432781219482, "objective/rlhf_reward": -15.969959580691988, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 17.86842155456543, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7412660121917725, "step": 1715, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9991625547409058 }, { "episode": 27472, "epoch": 0.4937987561563073, "loss/policy_avg": 0.5849929451942444, "lr": 8.903374233128835e-06, "objective/entropy": -421.6776123046875, "objective/kl": 36.75172424316406, "objective/non_score_reward": -3.6751723289489746, "objective/rlhf_reward": -13.322086908904414, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 5.228263854980469, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7187646627426147, "step": 1716, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.9988783597946167 }, { "episode": 27488, "epoch": 0.49408635007369595, "loss/policy_avg": 0.31840020418167114, "lr": 8.902735173824132e-06, "objective/entropy": -378.04498291015625, "objective/kl": 39.490936279296875, "objective/non_score_reward": -3.9490935802459717, "objective/rlhf_reward": -11.396374320983888, "objective/scores": 1.1, "policy/approxkl_avg": 39.834693908691406, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.714414119720459, "step": 1717, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 14, "val/ratio": 1.998468041419983 }, { "episode": 27504, "epoch": 0.4943739439910846, "loss/policy_avg": 0.6965379118919373, "lr": 8.902096114519429e-06, "objective/entropy": -404.1169128417969, "objective/kl": 28.763225555419922, "objective/non_score_reward": -2.8763227462768555, "objective/rlhf_reward": -10.179777894049806, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 6.35676383972168, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.5762016773223877, "step": 1718, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.0101749897003174 }, { "episode": 27520, "epoch": 0.4946615379084732, "loss/policy_avg": 0.20047542452812195, "lr": 8.901457055214725e-06, "objective/entropy": -375.56683349609375, "objective/kl": 34.94865036010742, "objective/non_score_reward": -3.4948651790618896, "objective/rlhf_reward": -12.600858547774655, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 0.9575294256210327, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.7747785449028015, "step": 1719, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 14, "val/ratio": 2.001783609390259 }, { "episode": 27536, "epoch": 0.49494913182586187, "loss/policy_avg": 0.38615530729293823, "lr": 8.900817995910022e-06, "objective/entropy": -356.30889892578125, "objective/kl": 30.722204208374023, "objective/non_score_reward": -3.0722203254699707, "objective/rlhf_reward": -10.166175308004888, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 2.290757417678833, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6604647636413574, "step": 1720, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.001187562942505 }, { "episode": 27552, "epoch": 0.4952367257432505, "loss/policy_avg": 0.4275597333908081, "lr": 8.900178936605317e-06, "objective/entropy": -406.0059814453125, "objective/kl": 45.256561279296875, "objective/non_score_reward": -4.525655746459961, "objective/rlhf_reward": -16.498503956858237, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 13.624431610107422, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7229255437850952, "step": 1721, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.999438762664795 }, { "episode": 27568, "epoch": 0.4955243196606392, "loss/policy_avg": 0.8116226196289062, "lr": 8.899539877300614e-06, "objective/entropy": -426.2886657714844, "objective/kl": 47.309974670410156, "objective/non_score_reward": -4.730997562408447, "objective/rlhf_reward": -17.582354119330077, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 29.090354919433594, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.6968857049942017, "step": 1722, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.9991822242736816 }, { "episode": 27584, "epoch": 0.49581191357802784, "loss/policy_avg": 1.0673338174819946, "lr": 8.89890081799591e-06, "objective/entropy": -380.26263427734375, "objective/kl": 33.305267333984375, "objective/non_score_reward": -3.330526351928711, "objective/rlhf_reward": -11.374694655613837, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 22.04172134399414, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6245201826095581, "step": 1723, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 14, "val/ratio": 1.9981802701950073 }, { "episode": 27600, "epoch": 0.4960995074954165, "loss/policy_avg": 1.5763514041900635, "lr": 8.898261758691206e-06, "objective/entropy": -410.19287109375, "objective/kl": 27.735843658447266, "objective/non_score_reward": -2.7735843658447266, "objective/rlhf_reward": -9.643738846392974, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 4.186507225036621, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6513317823410034, "step": 1724, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.0116453170776367 }, { "episode": 27616, "epoch": 0.4963871014128051, "loss/policy_avg": 0.30394670367240906, "lr": 8.897622699386503e-06, "objective/entropy": -390.67291259765625, "objective/kl": 36.39476013183594, "objective/non_score_reward": -3.6394758224487305, "objective/rlhf_reward": -13.076950433667065, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 13.346420288085938, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7221639156341553, "step": 1725, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.997687578201294 }, { "episode": 27632, "epoch": 0.49667469533019376, "loss/policy_avg": 0.9021917581558228, "lr": 8.8969836400818e-06, "objective/entropy": -416.77850341796875, "objective/kl": 36.998077392578125, "objective/non_score_reward": -3.69980788230896, "objective/rlhf_reward": -13.348633627505645, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 18.98017120361328, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6553675532341003, "step": 1726, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9982097148895264 }, { "episode": 27648, "epoch": 0.4969622892475824, "loss/policy_avg": 0.6672807931900024, "lr": 8.896344580777097e-06, "objective/entropy": -465.71466064453125, "objective/kl": 33.220855712890625, "objective/non_score_reward": -3.3220856189727783, "objective/rlhf_reward": -11.340931366162238, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 6.40546989440918, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6033002138137817, "step": 1727, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 1.998972773551941 }, { "episode": 27664, "epoch": 0.49724988316497104, "loss/policy_avg": -0.35464316606521606, "lr": 8.895705521472394e-06, "objective/entropy": -410.9930114746094, "objective/kl": 30.16810417175293, "objective/non_score_reward": -3.016810417175293, "objective/rlhf_reward": -10.58628964703834, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 9.88818645477295, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5748205780982971, "step": 1728, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 2.0097861289978027 }, { "episode": 27680, "epoch": 0.4975374770823597, "loss/policy_avg": -0.10950209200382233, "lr": 8.895066462167689e-06, "objective/entropy": -334.28167724609375, "objective/kl": 26.15772247314453, "objective/non_score_reward": -2.6157724857330322, "objective/rlhf_reward": -8.801230435789215, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 7.804541110992432, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5258300304412842, "step": 1729, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.9994142055511475 }, { "episode": 27696, "epoch": 0.4978250709997484, "loss/policy_avg": 0.7887710332870483, "lr": 8.894427402862986e-06, "objective/entropy": -475.0589904785156, "objective/kl": 40.825347900390625, "objective/non_score_reward": -4.082535266876221, "objective/rlhf_reward": -14.814368808063204, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 43.226749420166016, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.594218373298645, "step": 1730, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9979770183563232 }, { "episode": 27712, "epoch": 0.498112664917137, "loss/policy_avg": -0.3229426443576813, "lr": 8.893788343558283e-06, "objective/entropy": -437.2503967285156, "objective/kl": 23.355594635009766, "objective/non_score_reward": -2.335559368133545, "objective/rlhf_reward": -6.41851845824835, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 2.5067076683044434, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7092213034629822, "step": 1731, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 14, "val/ratio": 2.0014679431915283 }, { "episode": 27728, "epoch": 0.49840025883452566, "loss/policy_avg": 0.06289596110582352, "lr": 8.89314928425358e-06, "objective/entropy": -236.63531494140625, "objective/kl": 32.116756439208984, "objective/non_score_reward": -3.2116756439208984, "objective/rlhf_reward": -11.330930077823336, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 8.329802513122559, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.44025686383247375, "step": 1732, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.000166654586792 }, { "episode": 27744, "epoch": 0.4986878527519143, "loss/policy_avg": 0.07868768274784088, "lr": 8.892510224948877e-06, "objective/entropy": -351.57421875, "objective/kl": 37.17546463012695, "objective/non_score_reward": -3.7175469398498535, "objective/rlhf_reward": -13.266067061487753, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 68.1714859008789, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.713608980178833, "step": 1733, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 1.9989116191864014 }, { "episode": 27760, "epoch": 0.49897544666930294, "loss/policy_avg": 0.7408342361450195, "lr": 8.891871165644172e-06, "objective/entropy": -445.7039794921875, "objective/kl": 45.50661087036133, "objective/non_score_reward": -4.550661087036133, "objective/rlhf_reward": -16.861008694677977, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 28.666412353515625, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6951714754104614, "step": 1734, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.999977469444275 }, { "episode": 27776, "epoch": 0.4992630405866916, "loss/policy_avg": 0.13927161693572998, "lr": 8.891232106339469e-06, "objective/entropy": -402.11627197265625, "objective/kl": 30.67918586730957, "objective/non_score_reward": -3.0679187774658203, "objective/rlhf_reward": -10.930039694815306, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 2.6271979808807373, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.635367751121521, "step": 1735, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9985830783843994 }, { "episode": 27792, "epoch": 0.4995506345040802, "loss/policy_avg": -0.3262159526348114, "lr": 8.890593047034766e-06, "objective/entropy": -424.4947814941406, "objective/kl": 33.12832260131836, "objective/non_score_reward": -3.3128323554992676, "objective/rlhf_reward": -11.827497322757807, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 3.4509100914001465, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.6293026208877563, "step": 1736, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.001182794570923 }, { "episode": 27808, "epoch": 0.4998382284214689, "loss/policy_avg": 0.4612676799297333, "lr": 8.889953987730062e-06, "objective/entropy": -413.0513916015625, "objective/kl": 41.718780517578125, "objective/non_score_reward": -4.171878337860107, "objective/rlhf_reward": -15.308910229293208, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 3.904616355895996, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6563968658447266, "step": 1737, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 2.001758098602295 }, { "episode": 27824, "epoch": 0.5001258223388575, "loss/policy_avg": 1.0266878604888916, "lr": 8.88931492842536e-06, "objective/entropy": -377.7632751464844, "objective/kl": 37.8055419921875, "objective/non_score_reward": -3.7805542945861816, "objective/rlhf_reward": -13.174805710987982, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 11.085458755493164, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.6420415639877319, "step": 1738, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 2.0002970695495605 }, { "episode": 27840, "epoch": 0.5004134162562461, "loss/policy_avg": 1.7002661228179932, "lr": 8.888675869120656e-06, "objective/entropy": -415.87066650390625, "objective/kl": 50.93999481201172, "objective/non_score_reward": -5.09399938583374, "objective/rlhf_reward": -18.860225283893286, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 15.546028137207031, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5488225221633911, "step": 1739, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9978928565979004 }, { "episode": 27856, "epoch": 0.5007010101736348, "loss/policy_avg": -0.5962918996810913, "lr": 8.888036809815951e-06, "objective/entropy": -418.7705383300781, "objective/kl": 41.134124755859375, "objective/non_score_reward": -4.113411903381348, "objective/rlhf_reward": -15.112012913733153, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 29.38275909423828, "policy/clipfrac_avg": 2.0, "policy/entropy_avg": 0.6706892251968384, "step": 1740, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 2.001715660095215 }, { "episode": 27872, "epoch": 0.5009886040910235, "loss/policy_avg": 3.7292261123657227, "lr": 8.887397750511248e-06, "objective/entropy": -350.2513427734375, "objective/kl": 49.07064437866211, "objective/non_score_reward": -4.907064914703369, "objective/rlhf_reward": -18.177661041827545, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 85.828369140625, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7552378177642822, "step": 1741, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.9995660781860352 }, { "episode": 27888, "epoch": 0.5012761980084122, "loss/policy_avg": 0.2613942623138428, "lr": 8.886758691206545e-06, "objective/entropy": -435.34014892578125, "objective/kl": 39.9609375, "objective/non_score_reward": -3.996094226837158, "objective/rlhf_reward": -11.584376430511476, "objective/scores": 1.1, "policy/approxkl_avg": 9.731155395507812, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6009352207183838, "step": 1742, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.999889612197876 }, { "episode": 27904, "epoch": 0.5015637919258008, "loss/policy_avg": 0.09369057416915894, "lr": 8.886119631901842e-06, "objective/entropy": -413.3562316894531, "objective/kl": 41.635623931884766, "objective/non_score_reward": -4.163562774658203, "objective/rlhf_reward": -12.254249906539918, "objective/scores": 1.1, "policy/approxkl_avg": 6.885909080505371, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7856326103210449, "step": 1743, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 15, "val/ratio": 2.0004777908325195 }, { "episode": 27920, "epoch": 0.5018513858431894, "loss/policy_avg": 0.42016851902008057, "lr": 8.885480572597139e-06, "objective/entropy": -368.1833801269531, "objective/kl": 31.248809814453125, "objective/non_score_reward": -3.1248810291290283, "objective/rlhf_reward": -11.12092194804321, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 49.13007354736328, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8468366861343384, "step": 1744, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.998492956161499 }, { "episode": 27936, "epoch": 0.5021389797605781, "loss/policy_avg": -0.37283024191856384, "lr": 8.884841513292434e-06, "objective/entropy": -322.2724609375, "objective/kl": 42.44226837158203, "objective/non_score_reward": -4.244227409362793, "objective/rlhf_reward": -15.61765810224859, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 8.178682327270508, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.6921736001968384, "step": 1745, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 2.002105236053467 }, { "episode": 27952, "epoch": 0.5024265736779667, "loss/policy_avg": 1.5730620622634888, "lr": 8.884202453987731e-06, "objective/entropy": -345.21575927734375, "objective/kl": 39.659305572509766, "objective/non_score_reward": -3.965930700302124, "objective/rlhf_reward": -14.347950303348238, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 31.23470687866211, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7788860201835632, "step": 1746, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 2.000117301940918 }, { "episode": 27968, "epoch": 0.5027141675953554, "loss/policy_avg": -0.11038171499967575, "lr": 8.883563394683026e-06, "objective/entropy": -349.0570373535156, "objective/kl": 33.75109100341797, "objective/non_score_reward": -3.3751096725463867, "objective/rlhf_reward": -12.076606352527705, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 68.24091339111328, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8039116859436035, "step": 1747, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 2.0001983642578125 }, { "episode": 27984, "epoch": 0.503001761512744, "loss/policy_avg": 0.0014332067221403122, "lr": 8.882924335378323e-06, "objective/entropy": -422.93701171875, "objective/kl": 37.851226806640625, "objective/non_score_reward": -3.785122871398926, "objective/rlhf_reward": -13.017785730139288, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 3.9150452613830566, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6650782823562622, "step": 1748, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.0000879764556885 }, { "episode": 28000, "epoch": 0.5032893554301326, "loss/policy_avg": 0.43949779868125916, "lr": 8.88228527607362e-06, "objective/entropy": -412.0180969238281, "objective/kl": 36.02944564819336, "objective/non_score_reward": -3.602944850921631, "objective/rlhf_reward": -14.411778688430786, "objective/scores": 0.0, "policy/approxkl_avg": 18.912006378173828, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.867194414138794, "step": 1749, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 1.9992642402648926 }, { "episode": 28016, "epoch": 0.5035769493475213, "loss/policy_avg": 0.9830820560455322, "lr": 8.881646216768917e-06, "objective/entropy": -385.3985595703125, "objective/kl": 44.38771057128906, "objective/non_score_reward": -4.4387712478637695, "objective/rlhf_reward": -15.930255527767251, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 25.891408920288086, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6953531503677368, "step": 1750, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9968655109405518 }, { "episode": 28032, "epoch": 0.5038645432649099, "loss/policy_avg": 1.2726539373397827, "lr": 8.881007157464214e-06, "objective/entropy": -439.4725646972656, "objective/kl": 26.945690155029297, "objective/non_score_reward": -2.6945691108703613, "objective/rlhf_reward": -9.419026338790339, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 38.877891540527344, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.8223989009857178, "step": 1751, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9976494312286377 }, { "episode": 28048, "epoch": 0.5041521371822986, "loss/policy_avg": 0.05977855622768402, "lr": 8.88036809815951e-06, "objective/entropy": -324.543212890625, "objective/kl": 37.942081451416016, "objective/non_score_reward": -3.79420804977417, "objective/rlhf_reward": -13.054126920477422, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 2.1964454650878906, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6316072344779968, "step": 1752, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.000427484512329 }, { "episode": 28064, "epoch": 0.5044397310996872, "loss/policy_avg": 1.408951997756958, "lr": 8.879729038854806e-06, "objective/entropy": -522.6439819335938, "objective/kl": 34.8803825378418, "objective/non_score_reward": -3.4880380630493164, "objective/rlhf_reward": -11.552152967453004, "objective/scores": 0.6, "policy/approxkl_avg": 61.307090759277344, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6356030106544495, "step": 1753, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9989345073699951 }, { "episode": 28080, "epoch": 0.5047273250170758, "loss/policy_avg": 1.060495376586914, "lr": 8.879089979550103e-06, "objective/entropy": -375.7655944824219, "objective/kl": 32.8582878112793, "objective/non_score_reward": -3.2858288288116455, "objective/rlhf_reward": -11.627543532642063, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 32.1416015625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7518796920776367, "step": 1754, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9971826076507568 }, { "episode": 28096, "epoch": 0.5050149189344645, "loss/policy_avg": -0.482033908367157, "lr": 8.8784509202454e-06, "objective/entropy": -399.4933166503906, "objective/kl": 44.83428192138672, "objective/non_score_reward": -4.483428001403809, "objective/rlhf_reward": -16.200380341211954, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 0.9191845655441284, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7084457874298096, "step": 1755, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 2.000138282775879 }, { "episode": 28112, "epoch": 0.5053025128518532, "loss/policy_avg": 0.9278706312179565, "lr": 8.877811860940696e-06, "objective/entropy": -417.2349853515625, "objective/kl": 45.385623931884766, "objective/non_score_reward": -4.538562774658203, "objective/rlhf_reward": -16.70365272006546, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 43.58812713623047, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6438326835632324, "step": 1756, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9988031387329102 }, { "episode": 28128, "epoch": 0.5055901067692419, "loss/policy_avg": 1.8003909587860107, "lr": 8.877172801635993e-06, "objective/entropy": -376.5373840332031, "objective/kl": 42.20722198486328, "objective/non_score_reward": -4.220722198486328, "objective/rlhf_reward": -15.459057409961787, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 4.025523662567139, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.598468542098999, "step": 1757, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9987118244171143 }, { "episode": 28144, "epoch": 0.5058777006866305, "loss/policy_avg": 0.03465745970606804, "lr": 8.876533742331288e-06, "objective/entropy": -375.940673828125, "objective/kl": 34.30274200439453, "objective/non_score_reward": -3.43027400970459, "objective/rlhf_reward": -12.270498137088165, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 3.8771886825561523, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5058436989784241, "step": 1758, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 1.9991415739059448 }, { "episode": 28160, "epoch": 0.5061652946040192, "loss/policy_avg": 0.08600963652133942, "lr": 8.875894683026585e-06, "objective/entropy": -460.02734375, "objective/kl": 29.18838119506836, "objective/non_score_reward": -2.9188380241394043, "objective/rlhf_reward": -10.159580552371677, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 6.603547096252441, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.7612677216529846, "step": 1759, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 2.0072059631347656 }, { "episode": 28176, "epoch": 0.5064528885214078, "loss/policy_avg": 0.3010416626930237, "lr": 8.875255623721882e-06, "objective/entropy": -385.99066162109375, "objective/kl": 46.79908752441406, "objective/non_score_reward": -4.679908752441406, "objective/rlhf_reward": -17.26903758487259, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 12.585357666015625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.653283953666687, "step": 1760, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9991579055786133 }, { "episode": 28192, "epoch": 0.5067404824387964, "loss/policy_avg": 0.5261976718902588, "lr": 8.874616564417179e-06, "objective/entropy": -438.08740234375, "objective/kl": 38.64686584472656, "objective/non_score_reward": -3.8646864891052246, "objective/rlhf_reward": -11.0587459564209, "objective/scores": 1.1, "policy/approxkl_avg": 12.90796184539795, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6889891624450684, "step": 1761, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9995412826538086 }, { "episode": 28208, "epoch": 0.5070280763561851, "loss/policy_avg": 0.1498948484659195, "lr": 8.873977505112476e-06, "objective/entropy": -313.9883117675781, "objective/kl": 25.73459243774414, "objective/non_score_reward": -2.5734596252441406, "objective/rlhf_reward": -8.934588277076168, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 19.848987579345703, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.711419939994812, "step": 1762, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9975435733795166 }, { "episode": 28224, "epoch": 0.5073156702735737, "loss/policy_avg": 0.5180165767669678, "lr": 8.873338445807773e-06, "objective/entropy": -380.669921875, "objective/kl": 42.19300079345703, "objective/non_score_reward": -4.219300270080566, "objective/rlhf_reward": -13.953481350780699, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 13.501199722290039, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6426799297332764, "step": 1763, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9978431463241577 }, { "episode": 28240, "epoch": 0.5076032641909624, "loss/policy_avg": -0.11735323816537857, "lr": 8.872699386503068e-06, "objective/entropy": -442.0233154296875, "objective/kl": 39.99811935424805, "objective/non_score_reward": -3.999812126159668, "objective/rlhf_reward": -14.548650126071319, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 38.36247253417969, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.627625584602356, "step": 1764, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 14, "val/ratio": 2.0046586990356445 }, { "episode": 28256, "epoch": 0.507890858108351, "loss/policy_avg": 0.5731643438339233, "lr": 8.872060327198365e-06, "objective/entropy": -430.72393798828125, "objective/kl": 38.005638122558594, "objective/non_score_reward": -3.8005638122558594, "objective/rlhf_reward": -13.84300514433233, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 7.558406829833984, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6510865688323975, "step": 1765, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 1.997239351272583 }, { "episode": 28272, "epoch": 0.5081784520257396, "loss/policy_avg": 1.3579412698745728, "lr": 8.871421267893662e-06, "objective/entropy": -429.132568359375, "objective/kl": 41.63458251953125, "objective/non_score_reward": -4.163458347320557, "objective/rlhf_reward": -15.049713406626303, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 11.494589805603027, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5494046211242676, "step": 1766, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 2.0013632774353027 }, { "episode": 28288, "epoch": 0.5084660459431283, "loss/policy_avg": -0.14237716794013977, "lr": 8.870782208588959e-06, "objective/entropy": -397.9531555175781, "objective/kl": 32.58894348144531, "objective/non_score_reward": -3.2588939666748047, "objective/rlhf_reward": -11.676326238845272, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 13.734619140625, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7580890655517578, "step": 1767, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 2.0007190704345703 }, { "episode": 28304, "epoch": 0.5087536398605169, "loss/policy_avg": 0.21118766069412231, "lr": 8.870143149284254e-06, "objective/entropy": -409.110595703125, "objective/kl": 34.78706359863281, "objective/non_score_reward": -3.4787063598632812, "objective/rlhf_reward": -13.914825439453125, "objective/scores": 0.0, "policy/approxkl_avg": 36.42051696777344, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6058590412139893, "step": 1768, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.998093843460083 }, { "episode": 28320, "epoch": 0.5090412337779056, "loss/policy_avg": 0.14778955280780792, "lr": 8.86950408997955e-06, "objective/entropy": -375.5557556152344, "objective/kl": 39.117340087890625, "objective/non_score_reward": -3.911733865737915, "objective/rlhf_reward": -13.913602368036905, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 9.080873489379883, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7022272348403931, "step": 1769, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9978837966918945 }, { "episode": 28336, "epoch": 0.5093288276952942, "loss/policy_avg": 0.6867684125900269, "lr": 8.868865030674848e-06, "objective/entropy": -395.05401611328125, "objective/kl": 40.4748649597168, "objective/non_score_reward": -4.047486782073975, "objective/rlhf_reward": -14.067240419165167, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 17.31562042236328, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6878872513771057, "step": 1770, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9984920024871826 }, { "episode": 28352, "epoch": 0.509616421612683, "loss/policy_avg": 0.054829925298690796, "lr": 8.868225971370143e-06, "objective/entropy": -269.9557800292969, "objective/kl": 50.00763702392578, "objective/non_score_reward": -5.000763893127441, "objective/rlhf_reward": -17.07933560454962, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 1.8336164951324463, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.46249258518218994, "step": 1771, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.999645709991455 }, { "episode": 28368, "epoch": 0.5099040155300716, "loss/policy_avg": -0.21850018203258514, "lr": 8.86758691206544e-06, "objective/entropy": -457.8492431640625, "objective/kl": 38.19148254394531, "objective/non_score_reward": -3.819148540496826, "objective/rlhf_reward": -13.451766367229531, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 32.967803955078125, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.48317936062812805, "step": 1772, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 2.0044105052948 }, { "episode": 28384, "epoch": 0.5101916094474602, "loss/policy_avg": -0.2307959944009781, "lr": 8.866947852760737e-06, "objective/entropy": -415.6637268066406, "objective/kl": 36.802452087402344, "objective/non_score_reward": -3.6802453994750977, "objective/rlhf_reward": -12.773570249752936, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 1.94115149974823, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.83992600440979, "step": 1773, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 16, "val/ratio": 2.0008411407470703 }, { "episode": 28400, "epoch": 0.5104792033648489, "loss/policy_avg": 0.41877320408821106, "lr": 8.866308793456033e-06, "objective/entropy": -367.9455871582031, "objective/kl": 35.76533889770508, "objective/non_score_reward": -3.5765342712402344, "objective/rlhf_reward": -12.481308098110269, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 6.875262260437012, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6272455453872681, "step": 1774, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 2.0013580322265625 }, { "episode": 28416, "epoch": 0.5107667972822375, "loss/policy_avg": 1.2223483324050903, "lr": 8.86566973415133e-06, "objective/entropy": -359.39984130859375, "objective/kl": 36.68757629394531, "objective/non_score_reward": -3.668757677078247, "objective/rlhf_reward": -13.070910964075642, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 1.9005379676818848, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6889514923095703, "step": 1775, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 2.0009469985961914 }, { "episode": 28432, "epoch": 0.5110543911996261, "loss/policy_avg": 0.05632822960615158, "lr": 8.865030674846627e-06, "objective/entropy": -444.615966796875, "objective/kl": 35.7459716796875, "objective/non_score_reward": -3.574596881866455, "objective/rlhf_reward": -12.636528497160064, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 2.715425968170166, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6466243267059326, "step": 1776, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 2.000438928604126 }, { "episode": 28448, "epoch": 0.5113419851170148, "loss/policy_avg": -0.10230511426925659, "lr": 8.864391615541922e-06, "objective/entropy": -277.28240966796875, "objective/kl": 39.28502655029297, "objective/non_score_reward": -3.9285027980804443, "objective/rlhf_reward": -14.335408547011713, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 31.657413482666016, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.795226514339447, "step": 1777, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.0056231021881104 }, { "episode": 28464, "epoch": 0.5116295790344034, "loss/policy_avg": -0.19402796030044556, "lr": 8.86375255623722e-06, "objective/entropy": -444.5499572753906, "objective/kl": 30.017902374267578, "objective/non_score_reward": -3.0017900466918945, "objective/rlhf_reward": -10.607160902023315, "objective/scores": 0.35, "policy/approxkl_avg": 0.7446051239967346, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5373311042785645, "step": 1778, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 2.0002200603485107 }, { "episode": 28480, "epoch": 0.5119171729517921, "loss/policy_avg": 0.47494810819625854, "lr": 8.863113496932516e-06, "objective/entropy": -400.908935546875, "objective/kl": 36.46863555908203, "objective/non_score_reward": -3.6468634605407715, "objective/rlhf_reward": -13.20885191210876, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 1.232701063156128, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7144824862480164, "step": 1779, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 1.9996256828308105 }, { "episode": 28496, "epoch": 0.5122047668691807, "loss/policy_avg": 0.6276746988296509, "lr": 8.862474437627813e-06, "objective/entropy": -382.78131103515625, "objective/kl": 35.647369384765625, "objective/non_score_reward": -3.564737319946289, "objective/rlhf_reward": -9.858949518203737, "objective/scores": 1.1, "policy/approxkl_avg": 17.388185501098633, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7432838678359985, "step": 1780, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9992676973342896 }, { "episode": 28512, "epoch": 0.5124923607865693, "loss/policy_avg": 0.8599903583526611, "lr": 8.86183537832311e-06, "objective/entropy": -434.1091003417969, "objective/kl": 39.21890640258789, "objective/non_score_reward": -3.9218907356262207, "objective/rlhf_reward": -14.362050566703005, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 5.121826171875, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7468260526657104, "step": 1781, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.000609874725342 }, { "episode": 28528, "epoch": 0.512779954703958, "loss/policy_avg": 0.07229068875312805, "lr": 8.861196319018405e-06, "objective/entropy": -383.1351318359375, "objective/kl": 37.84386444091797, "objective/non_score_reward": -3.78438663482666, "objective/rlhf_reward": -10.737546300888063, "objective/scores": 1.1, "policy/approxkl_avg": 1.079226016998291, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.699774444103241, "step": 1782, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.0003957748413086 }, { "episode": 28544, "epoch": 0.5130675486213466, "loss/policy_avg": -0.18846699595451355, "lr": 8.860557259713702e-06, "objective/entropy": -387.8876953125, "objective/kl": 29.544404983520508, "objective/non_score_reward": -2.9544405937194824, "objective/rlhf_reward": -10.213642273012717, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 9.303590774536133, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7668851613998413, "step": 1783, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 14, "val/ratio": 2.00390625 }, { "episode": 28560, "epoch": 0.5133551425387353, "loss/policy_avg": 0.06743152439594269, "lr": 8.859918200408999e-06, "objective/entropy": -373.4194030761719, "objective/kl": 39.27418518066406, "objective/non_score_reward": -3.9274187088012695, "objective/rlhf_reward": -14.259076456637725, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 2.983701705932617, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5661546587944031, "step": 1784, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 2.000019073486328 }, { "episode": 28576, "epoch": 0.5136427364561239, "loss/policy_avg": 0.1927054524421692, "lr": 8.859279141104296e-06, "objective/entropy": -436.22100830078125, "objective/kl": 33.516143798828125, "objective/non_score_reward": -3.351614475250244, "objective/rlhf_reward": -11.58162843731315, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 1.4211245775222778, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7223166823387146, "step": 1785, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 2.0001449584960938 }, { "episode": 28592, "epoch": 0.5139303303735125, "loss/policy_avg": 0.7831727266311646, "lr": 8.858640081799593e-06, "objective/entropy": -336.43060302734375, "objective/kl": 35.12346267700195, "objective/non_score_reward": -3.5123462677001953, "objective/rlhf_reward": -12.224555726322244, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 17.671064376831055, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6742507219314575, "step": 1786, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.0002756118774414 }, { "episode": 28608, "epoch": 0.5142179242909013, "loss/policy_avg": -0.6096823215484619, "lr": 8.85800102249489e-06, "objective/entropy": -327.6683654785156, "objective/kl": 33.464019775390625, "objective/non_score_reward": -3.3464019298553467, "objective/rlhf_reward": -11.262901248709234, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 16.13820457458496, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7356235384941101, "step": 1787, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9986097812652588 }, { "episode": 28624, "epoch": 0.5145055182082899, "loss/policy_avg": 0.22873713076114655, "lr": 8.857361963190185e-06, "objective/entropy": -416.158935546875, "objective/kl": 32.825904846191406, "objective/non_score_reward": -3.282590866088867, "objective/rlhf_reward": -10.20664421165106, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 6.251375675201416, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6167716979980469, "step": 1788, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 15, "val/ratio": 1.999826431274414 }, { "episode": 28640, "epoch": 0.5147931121256786, "loss/policy_avg": 0.39884504675865173, "lr": 8.856722903885481e-06, "objective/entropy": -387.8585205078125, "objective/kl": 40.225120544433594, "objective/non_score_reward": -4.0225114822387695, "objective/rlhf_reward": -14.66621526022729, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 2.8301403522491455, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.9147070646286011, "step": 1789, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 1.998860239982605 }, { "episode": 28656, "epoch": 0.5150807060430672, "loss/policy_avg": 0.0943792462348938, "lr": 8.856083844580777e-06, "objective/entropy": -478.66595458984375, "objective/kl": 29.81654930114746, "objective/non_score_reward": -2.9816551208496094, "objective/rlhf_reward": -9.52662036418915, "objective/scores": 0.6, "policy/approxkl_avg": 0.8293582797050476, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7382746338844299, "step": 1790, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.999783992767334 }, { "episode": 28672, "epoch": 0.5153682999604559, "loss/policy_avg": 1.064944863319397, "lr": 8.855444785276074e-06, "objective/entropy": -337.23516845703125, "objective/kl": 37.60517883300781, "objective/non_score_reward": -3.7605180740356445, "objective/rlhf_reward": -13.380213265836822, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 56.2615852355957, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6233469247817993, "step": 1791, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.0007376670837402 }, { "episode": 28688, "epoch": 0.5156558938778445, "loss/policy_avg": 0.4659278988838196, "lr": 8.85480572597137e-06, "objective/entropy": -380.7203369140625, "objective/kl": 49.99015808105469, "objective/non_score_reward": -4.999015808105469, "objective/rlhf_reward": -18.596063709259035, "objective/scores": 0.35, "policy/approxkl_avg": 11.835033416748047, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.8537371754646301, "step": 1792, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 14, "val/ratio": 1.9970762729644775 }, { "episode": 28704, "epoch": 0.5159434877952331, "loss/policy_avg": 0.21428313851356506, "lr": 8.854166666666667e-06, "objective/entropy": -383.1554870605469, "objective/kl": 29.598880767822266, "objective/non_score_reward": -2.959888219833374, "objective/rlhf_reward": -9.716846647039924, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 6.110344886779785, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7900140285491943, "step": 1793, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.9975876808166504 }, { "episode": 28720, "epoch": 0.5162310817126218, "loss/policy_avg": 0.4651057720184326, "lr": 8.853527607361964e-06, "objective/entropy": -429.013427734375, "objective/kl": 39.36705780029297, "objective/non_score_reward": -3.9367058277130127, "objective/rlhf_reward": -14.368221142379145, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 3.5950591564178467, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6544770002365112, "step": 1794, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 2.0011582374572754 }, { "episode": 28736, "epoch": 0.5165186756300104, "loss/policy_avg": -0.8366875648498535, "lr": 8.85288854805726e-06, "objective/entropy": -358.90576171875, "objective/kl": 32.215084075927734, "objective/non_score_reward": -3.221508264541626, "objective/rlhf_reward": -11.061204667362283, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 3.7713708877563477, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7476552724838257, "step": 1795, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 1.9999535083770752 }, { "episode": 28752, "epoch": 0.516806269547399, "loss/policy_avg": 0.22891204059123993, "lr": 8.852249488752556e-06, "objective/entropy": -414.8268127441406, "objective/kl": 37.558433532714844, "objective/non_score_reward": -3.755843162536621, "objective/rlhf_reward": -13.57277450999771, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 19.94131851196289, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6356815099716187, "step": 1796, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9976699352264404 }, { "episode": 28768, "epoch": 0.5170938634647877, "loss/policy_avg": -0.062338367104530334, "lr": 8.851610429447853e-06, "objective/entropy": -414.5196533203125, "objective/kl": 43.42176055908203, "objective/non_score_reward": -4.342175483703613, "objective/rlhf_reward": -15.245996417776617, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 21.672277450561523, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6863969564437866, "step": 1797, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.997234582901001 }, { "episode": 28784, "epoch": 0.5173814573821763, "loss/policy_avg": -0.2489970177412033, "lr": 8.85097137014315e-06, "objective/entropy": -373.1884765625, "objective/kl": 26.106861114501953, "objective/non_score_reward": -2.6106860637664795, "objective/rlhf_reward": -9.018912036617365, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 2.2320964336395264, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6770190000534058, "step": 1798, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.0009703636169434 }, { "episode": 28800, "epoch": 0.517669051299565, "loss/policy_avg": -0.3858693242073059, "lr": 8.850332310838447e-06, "objective/entropy": -433.9825439453125, "objective/kl": 30.610628128051758, "objective/non_score_reward": -3.061063051223755, "objective/rlhf_reward": -10.918739113837404, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 2.815208911895752, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.8374090194702148, "step": 1799, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9997122287750244 }, { "episode": 28816, "epoch": 0.5179566452169536, "loss/policy_avg": 4.439436435699463, "lr": 8.849693251533744e-06, "objective/entropy": -430.10662841796875, "objective/kl": 33.91274642944336, "objective/non_score_reward": -3.3912744522094727, "objective/rlhf_reward": -12.239585433036012, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 2.765347719192505, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7089810371398926, "step": 1800, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 2.000128984451294 }, { "episode": 28832, "epoch": 0.5182442391343423, "loss/policy_avg": 0.6766610145568848, "lr": 8.849054192229039e-06, "objective/entropy": -367.77606201171875, "objective/kl": 36.89229202270508, "objective/non_score_reward": -3.6892290115356445, "objective/rlhf_reward": -12.634209575430425, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 87.1373291015625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6905878782272339, "step": 1801, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.997178554534912 }, { "episode": 28848, "epoch": 0.518531833051731, "loss/policy_avg": 1.1133520603179932, "lr": 8.848415132924336e-06, "objective/entropy": -408.41680908203125, "objective/kl": 30.455432891845703, "objective/non_score_reward": -3.0455434322357178, "objective/rlhf_reward": -10.357344980510781, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 7.8278703689575195, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5565105676651001, "step": 1802, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9965697526931763 }, { "episode": 28864, "epoch": 0.5188194269691196, "loss/policy_avg": 0.47599565982818604, "lr": 8.847776073619633e-06, "objective/entropy": -400.8670654296875, "objective/kl": 34.5140266418457, "objective/non_score_reward": -3.4514026641845703, "objective/rlhf_reward": -11.40561113357544, "objective/scores": 0.6, "policy/approxkl_avg": 118.000732421875, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7566095590591431, "step": 1803, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.9993181228637695 }, { "episode": 28880, "epoch": 0.5191070208865083, "loss/policy_avg": -0.2508164346218109, "lr": 8.84713701431493e-06, "objective/entropy": -338.53399658203125, "objective/kl": 33.789756774902344, "objective/non_score_reward": -3.3789753913879395, "objective/rlhf_reward": -11.911782298151572, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 5.6583428382873535, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7076740264892578, "step": 1804, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.000040054321289 }, { "episode": 28896, "epoch": 0.5193946148038969, "loss/policy_avg": 0.13225993514060974, "lr": 8.846497955010226e-06, "objective/entropy": -390.15869140625, "objective/kl": 23.06563949584961, "objective/non_score_reward": -2.3065638542175293, "objective/rlhf_reward": -7.826256132125854, "objective/scores": 0.35, "policy/approxkl_avg": 5.136684417724609, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7140323519706726, "step": 1805, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9987881183624268 }, { "episode": 28912, "epoch": 0.5196822087212856, "loss/policy_avg": 0.33939534425735474, "lr": 8.845858895705522e-06, "objective/entropy": -428.7296142578125, "objective/kl": 42.57624053955078, "objective/non_score_reward": -4.257624626159668, "objective/rlhf_reward": -15.426377329889853, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 49.87748718261719, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6631081104278564, "step": 1806, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9995882511138916 }, { "episode": 28928, "epoch": 0.5199698026386742, "loss/policy_avg": 0.25259172916412354, "lr": 8.845219836400819e-06, "objective/entropy": -260.5114440917969, "objective/kl": 38.17537307739258, "objective/non_score_reward": -3.817537307739258, "objective/rlhf_reward": -13.666028950277884, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 5.170551300048828, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6624091863632202, "step": 1807, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 1.999018669128418 }, { "episode": 28944, "epoch": 0.5202573965560628, "loss/policy_avg": -0.5341210961341858, "lr": 8.844580777096115e-06, "objective/entropy": -342.8121643066406, "objective/kl": 38.384029388427734, "objective/non_score_reward": -3.83840274810791, "objective/rlhf_reward": -13.87265932839668, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 1.9913593530654907, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6562291979789734, "step": 1808, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 2.0078423023223877 }, { "episode": 28960, "epoch": 0.5205449904734515, "loss/policy_avg": 0.9154263138771057, "lr": 8.843941717791412e-06, "objective/entropy": -425.54608154296875, "objective/kl": 41.25331115722656, "objective/non_score_reward": -4.125330924987793, "objective/rlhf_reward": -14.985551917346651, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 2.623509407043457, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6934017539024353, "step": 1809, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 1.9996366500854492 }, { "episode": 28976, "epoch": 0.5208325843908401, "loss/policy_avg": 0.4642972946166992, "lr": 8.84330265848671e-06, "objective/entropy": -404.79730224609375, "objective/kl": 35.159324645996094, "objective/non_score_reward": -3.515932321548462, "objective/rlhf_reward": -12.238900537761758, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 4.052479267120361, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6886200904846191, "step": 1810, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.999322772026062 }, { "episode": 28992, "epoch": 0.5211201783082288, "loss/policy_avg": 0.8925999402999878, "lr": 8.842663599182006e-06, "objective/entropy": -391.0741271972656, "objective/kl": 46.49000549316406, "objective/non_score_reward": -4.649000644683838, "objective/rlhf_reward": -17.236752712462824, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 8.975879669189453, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6933857202529907, "step": 1811, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9989970922470093 }, { "episode": 29008, "epoch": 0.5214077722256174, "loss/policy_avg": -0.06674160808324814, "lr": 8.842024539877301e-06, "objective/entropy": -353.45404052734375, "objective/kl": 40.15724182128906, "objective/non_score_reward": -4.015724182128906, "objective/rlhf_reward": -14.639064629276362, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 3.411278247833252, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5892839431762695, "step": 1812, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 2.000232696533203 }, { "episode": 29024, "epoch": 0.521695366143006, "loss/policy_avg": 0.24771270155906677, "lr": 8.841385480572598e-06, "objective/entropy": -385.57427978515625, "objective/kl": 23.51211929321289, "objective/non_score_reward": -2.3512120246887207, "objective/rlhf_reward": -8.004848337173462, "objective/scores": 0.35, "policy/approxkl_avg": 1.991456151008606, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.6382764577865601, "step": 1813, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9996665716171265 }, { "episode": 29040, "epoch": 0.5219829600603947, "loss/policy_avg": 1.1466610431671143, "lr": 8.840746421267893e-06, "objective/entropy": -430.94549560546875, "objective/kl": 37.720699310302734, "objective/non_score_reward": -3.7720699310302734, "objective/rlhf_reward": -15.088280439376831, "objective/scores": 0.0, "policy/approxkl_avg": 1.6916029453277588, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.7269685864448547, "step": 1814, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 2.005490779876709 }, { "episode": 29056, "epoch": 0.5222705539777833, "loss/policy_avg": 0.35351186990737915, "lr": 8.84010736196319e-06, "objective/entropy": -419.12939453125, "objective/kl": 34.62811279296875, "objective/non_score_reward": -3.4628114700317383, "objective/rlhf_reward": -12.427414257724848, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 1.4672188758850098, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7362732887268066, "step": 1815, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 2.0051357746124268 }, { "episode": 29072, "epoch": 0.522558147895172, "loss/policy_avg": -0.11501419544219971, "lr": 8.839468302658487e-06, "objective/entropy": -412.73345947265625, "objective/kl": 28.61923599243164, "objective/non_score_reward": -2.8619236946105957, "objective/rlhf_reward": -11.447694540023804, "objective/scores": 0.0, "policy/approxkl_avg": 4.582057952880859, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6852255463600159, "step": 1816, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9996578693389893 }, { "episode": 29088, "epoch": 0.5228457418125607, "loss/policy_avg": 0.30757230520248413, "lr": 8.838829243353784e-06, "objective/entropy": -416.80963134765625, "objective/kl": 32.740966796875, "objective/non_score_reward": -3.2740964889526367, "objective/rlhf_reward": -11.754750063925414, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 16.353458404541016, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7094159126281738, "step": 1817, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 14, "val/ratio": 1.9986659288406372 }, { "episode": 29104, "epoch": 0.5231333357299494, "loss/policy_avg": -0.1926991194486618, "lr": 8.83819018404908e-06, "objective/entropy": -411.597900390625, "objective/kl": 40.67694091796875, "objective/non_score_reward": -4.067694187164307, "objective/rlhf_reward": -11.870776987075805, "objective/scores": 1.1, "policy/approxkl_avg": 2.2110366821289062, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6443576812744141, "step": 1818, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.0000925064086914 }, { "episode": 29120, "epoch": 0.523420929647338, "loss/policy_avg": 0.3801366686820984, "lr": 8.837551124744376e-06, "objective/entropy": -383.106689453125, "objective/kl": 40.06800842285156, "objective/non_score_reward": -4.006800651550293, "objective/rlhf_reward": -14.511430823596651, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 30.051219940185547, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.817169189453125, "step": 1819, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 16, "val/ratio": 1.998051404953003 }, { "episode": 29136, "epoch": 0.5237085235647266, "loss/policy_avg": 0.55712890625, "lr": 8.836912065439673e-06, "objective/entropy": -384.9119567871094, "objective/kl": 46.39973449707031, "objective/non_score_reward": -4.639974117279053, "objective/rlhf_reward": -17.0036366870075, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 67.28683471679688, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7009698748588562, "step": 1820, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 2.001417636871338 }, { "episode": 29152, "epoch": 0.5239961174821153, "loss/policy_avg": -0.044126320630311966, "lr": 8.83627300613497e-06, "objective/entropy": -350.6741638183594, "objective/kl": 44.03277587890625, "objective/non_score_reward": -4.403277397155762, "objective/rlhf_reward": -16.189277727802363, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 2.722665309906006, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6550916433334351, "step": 1821, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 14, "val/ratio": 2.006425380706787 }, { "episode": 29168, "epoch": 0.5242837113995039, "loss/policy_avg": 0.47921860218048096, "lr": 8.835633946830267e-06, "objective/entropy": -422.56585693359375, "objective/kl": 38.356788635253906, "objective/non_score_reward": -3.835678815841675, "objective/rlhf_reward": -10.942715740203857, "objective/scores": 1.1, "policy/approxkl_avg": 37.65925598144531, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5983825922012329, "step": 1822, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9987446069717407 }, { "episode": 29184, "epoch": 0.5245713053168926, "loss/policy_avg": 0.640871524810791, "lr": 8.834994887525563e-06, "objective/entropy": -429.0748291015625, "objective/kl": 43.26716232299805, "objective/non_score_reward": -4.32671594619751, "objective/rlhf_reward": -15.645004516065704, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 2.222081184387207, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.6358453631401062, "step": 1823, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9990040063858032 }, { "episode": 29200, "epoch": 0.5248588992342812, "loss/policy_avg": -0.07319626212120056, "lr": 8.83435582822086e-06, "objective/entropy": -423.3052978515625, "objective/kl": 36.56036376953125, "objective/non_score_reward": -3.656036853790283, "objective/rlhf_reward": -13.24554453143249, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 0.9003111124038696, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7399861812591553, "step": 1824, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 2.0007081031799316 }, { "episode": 29216, "epoch": 0.5251464931516698, "loss/policy_avg": 1.6112968921661377, "lr": 8.833716768916156e-06, "objective/entropy": -444.9851379394531, "objective/kl": 36.0670051574707, "objective/non_score_reward": -3.6067006587982178, "objective/rlhf_reward": -12.026802396774292, "objective/scores": 0.6, "policy/approxkl_avg": 1.705216646194458, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.6899057626724243, "step": 1825, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9996836185455322 }, { "episode": 29232, "epoch": 0.5254340870690585, "loss/policy_avg": 0.9066250324249268, "lr": 8.833077709611452e-06, "objective/entropy": -380.48492431640625, "objective/kl": 41.267059326171875, "objective/non_score_reward": -4.126706123352051, "objective/rlhf_reward": -14.384119214788946, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 3.3713717460632324, "policy/clipfrac_avg": 0.25, "policy/entropy_avg": 0.6601887941360474, "step": 1826, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.0004353523254395 }, { "episode": 29248, "epoch": 0.5257216809864471, "loss/policy_avg": 0.43188661336898804, "lr": 8.83243865030675e-06, "objective/entropy": -378.43994140625, "objective/kl": 30.579330444335938, "objective/non_score_reward": -3.0579333305358887, "objective/rlhf_reward": -10.872483932708187, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 76.67919921875, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6789950132369995, "step": 1827, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9990952014923096 }, { "episode": 29264, "epoch": 0.5260092749038358, "loss/policy_avg": 0.265228807926178, "lr": 8.831799591002046e-06, "objective/entropy": -431.779052734375, "objective/kl": 30.470809936523438, "objective/non_score_reward": -3.047081232070923, "objective/rlhf_reward": -10.809722640601496, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 11.598209381103516, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7093855738639832, "step": 1828, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 1.998291015625 }, { "episode": 29280, "epoch": 0.5262968688212244, "loss/policy_avg": 0.6392818689346313, "lr": 8.831160531697343e-06, "objective/entropy": -369.1026611328125, "objective/kl": 45.41571807861328, "objective/non_score_reward": -4.541572093963623, "objective/rlhf_reward": -18.166287779808044, "objective/scores": 0.0, "policy/approxkl_avg": 4.598066806793213, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.530470609664917, "step": 1829, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9980814456939697 }, { "episode": 29296, "epoch": 0.526584462738613, "loss/policy_avg": -0.028046652674674988, "lr": 8.83052147239264e-06, "objective/entropy": -379.852294921875, "objective/kl": 32.9768180847168, "objective/non_score_reward": -3.2976818084716797, "objective/rlhf_reward": -11.068021240011726, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 5.24629020690918, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.9729580879211426, "step": 1830, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 2.000474214553833 }, { "episode": 29312, "epoch": 0.5268720566560017, "loss/policy_avg": -0.09381988644599915, "lr": 8.829882413087935e-06, "objective/entropy": -372.1632995605469, "objective/kl": 31.014314651489258, "objective/non_score_reward": -3.101431369781494, "objective/rlhf_reward": -10.80160597330721, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 1.6243351697921753, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5825165510177612, "step": 1831, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.0001912117004395 }, { "episode": 29328, "epoch": 0.5271596505733904, "loss/policy_avg": 1.057373285293579, "lr": 8.829243353783232e-06, "objective/entropy": -377.90179443359375, "objective/kl": 36.308006286621094, "objective/non_score_reward": -3.630800247192383, "objective/rlhf_reward": -13.181566050558715, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 35.253868103027344, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5869263410568237, "step": 1832, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.0008511543273926 }, { "episode": 29344, "epoch": 0.5274472444907791, "loss/policy_avg": 3.6712849140167236, "lr": 8.828604294478529e-06, "objective/entropy": -432.9620666503906, "objective/kl": 31.893817901611328, "objective/non_score_reward": -3.1893820762634277, "objective/rlhf_reward": -11.024194733301798, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 1.5733556747436523, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.6838439702987671, "step": 1833, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 2.000973701477051 }, { "episode": 29360, "epoch": 0.5277348384081677, "loss/policy_avg": 0.3517574667930603, "lr": 8.827965235173824e-06, "objective/entropy": -417.96051025390625, "objective/kl": 35.22224807739258, "objective/non_score_reward": -3.5222249031066895, "objective/rlhf_reward": -12.665067036350337, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 30.191883087158203, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.711654543876648, "step": 1834, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9992337226867676 }, { "episode": 29376, "epoch": 0.5280224323255563, "loss/policy_avg": 0.31956470012664795, "lr": 8.827326175869121e-06, "objective/entropy": -423.5634765625, "objective/kl": 36.68805694580078, "objective/non_score_reward": -3.6688058376312256, "objective/rlhf_reward": -13.224625448794708, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 2.307436227798462, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6479382514953613, "step": 1835, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.998690128326416 }, { "episode": 29392, "epoch": 0.528310026242945, "loss/policy_avg": 0.28511127829551697, "lr": 8.826687116564418e-06, "objective/entropy": -460.23846435546875, "objective/kl": 30.798168182373047, "objective/non_score_reward": -3.0798168182373047, "objective/rlhf_reward": -10.868669132800445, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 41.211090087890625, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7127453088760376, "step": 1836, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 1.9977433681488037 }, { "episode": 29408, "epoch": 0.5285976201603336, "loss/policy_avg": 0.1020292341709137, "lr": 8.826048057259715e-06, "objective/entropy": -407.5755920410156, "objective/kl": 32.515830993652344, "objective/non_score_reward": -3.2515830993652344, "objective/rlhf_reward": -11.627730467406613, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 38.7227897644043, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.705349326133728, "step": 1837, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9998595714569092 }, { "episode": 29424, "epoch": 0.5288852140777223, "loss/policy_avg": 0.7309010624885559, "lr": 8.82540899795501e-06, "objective/entropy": -418.5655212402344, "objective/kl": 35.60040283203125, "objective/non_score_reward": -3.56003999710083, "objective/rlhf_reward": -11.31644133174536, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 19.418109893798828, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8147119879722595, "step": 1838, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9975545406341553 }, { "episode": 29440, "epoch": 0.5291728079951109, "loss/policy_avg": 0.41623181104660034, "lr": 8.824769938650307e-06, "objective/entropy": -407.2655029296875, "objective/kl": 37.855628967285156, "objective/non_score_reward": -3.7855629920959473, "objective/rlhf_reward": -13.538132224146445, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 5.345246315002441, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6792882680892944, "step": 1839, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9987549781799316 }, { "episode": 29456, "epoch": 0.5294604019124995, "loss/policy_avg": 0.6483747959136963, "lr": 8.824130879345604e-06, "objective/entropy": -450.36065673828125, "objective/kl": 36.293216705322266, "objective/non_score_reward": -3.629321575164795, "objective/rlhf_reward": -11.59356728637335, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 3.0351998805999756, "policy/clipfrac_avg": 0.25, "policy/entropy_avg": 0.763587474822998, "step": 1840, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 2.0031847953796387 }, { "episode": 29472, "epoch": 0.5297479958298882, "loss/policy_avg": -0.16719195246696472, "lr": 8.8234918200409e-06, "objective/entropy": -414.3216552734375, "objective/kl": 40.14808654785156, "objective/non_score_reward": -4.014808654785156, "objective/rlhf_reward": -14.45511511332186, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 3.1850852966308594, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6265361309051514, "step": 1841, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 2.0015580654144287 }, { "episode": 29488, "epoch": 0.5300355897472768, "loss/policy_avg": -0.455500066280365, "lr": 8.822852760736197e-06, "objective/entropy": -425.56231689453125, "objective/kl": 36.50178527832031, "objective/non_score_reward": -3.650178909301758, "objective/rlhf_reward": -13.08494385460251, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 5.9299421310424805, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.63154137134552, "step": 1842, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.001269578933716 }, { "episode": 29504, "epoch": 0.5303231836646655, "loss/policy_avg": 0.1651121973991394, "lr": 8.822213701431494e-06, "objective/entropy": -393.5419921875, "objective/kl": 29.719928741455078, "objective/non_score_reward": -2.9719929695129395, "objective/rlhf_reward": -7.487971997261048, "objective/scores": 1.1, "policy/approxkl_avg": 2.269674777984619, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6407129764556885, "step": 1843, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 2.0012855529785156 }, { "episode": 29520, "epoch": 0.5306107775820541, "loss/policy_avg": 0.457575261592865, "lr": 8.82157464212679e-06, "objective/entropy": -412.970703125, "objective/kl": 46.339717864990234, "objective/non_score_reward": -4.63397216796875, "objective/rlhf_reward": -16.711059208187173, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 5.536159038543701, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6251356601715088, "step": 1844, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 2.0007238388061523 }, { "episode": 29536, "epoch": 0.5308983714994427, "loss/policy_avg": 0.20352207124233246, "lr": 8.820935582822086e-06, "objective/entropy": -384.1399841308594, "objective/kl": 29.551206588745117, "objective/non_score_reward": -2.95512056350708, "objective/rlhf_reward": -10.441880085555416, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 0.4627244472503662, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6161606311798096, "step": 1845, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 2.001067638397217 }, { "episode": 29552, "epoch": 0.5311859654168314, "loss/policy_avg": 0.13906988501548767, "lr": 8.820296523517383e-06, "objective/entropy": -386.9287109375, "objective/kl": 32.09199523925781, "objective/non_score_reward": -3.2091996669769287, "objective/rlhf_reward": -11.477548563216608, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 3.3763790130615234, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7411707639694214, "step": 1846, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.001765012741089 }, { "episode": 29568, "epoch": 0.5314735593342201, "loss/policy_avg": -0.5066956281661987, "lr": 8.81965746421268e-06, "objective/entropy": -400.3838806152344, "objective/kl": 38.35633850097656, "objective/non_score_reward": -3.8356337547302246, "objective/rlhf_reward": -13.219828309790167, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 0.6074817180633545, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.6242649555206299, "step": 1847, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.0008466243743896 }, { "episode": 29584, "epoch": 0.5317611532516088, "loss/policy_avg": -0.11110822856426239, "lr": 8.819018404907977e-06, "objective/entropy": -415.26104736328125, "objective/kl": 35.89327621459961, "objective/non_score_reward": -3.589327335357666, "objective/rlhf_reward": -12.75318959719332, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 2.032853126525879, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6512523889541626, "step": 1848, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.0011587142944336 }, { "episode": 29600, "epoch": 0.5320487471689974, "loss/policy_avg": 0.10872027277946472, "lr": 8.818379345603272e-06, "objective/entropy": -400.09442138671875, "objective/kl": 32.28276062011719, "objective/non_score_reward": -3.228276014328003, "objective/rlhf_reward": -11.251244311750519, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 8.959175109863281, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6273148059844971, "step": 1849, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9985003471374512 }, { "episode": 29616, "epoch": 0.5323363410863861, "loss/policy_avg": 0.497783899307251, "lr": 8.817740286298569e-06, "objective/entropy": -388.5885009765625, "objective/kl": 30.702590942382812, "objective/non_score_reward": -3.070258855819702, "objective/rlhf_reward": -10.456206913265298, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 7.343198776245117, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7948764562606812, "step": 1850, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.997718095779419 }, { "episode": 29632, "epoch": 0.5326239350037747, "loss/policy_avg": 0.2671676576137543, "lr": 8.817101226993866e-06, "objective/entropy": -432.7732238769531, "objective/kl": 31.69662857055664, "objective/non_score_reward": -3.1696629524230957, "objective/rlhf_reward": -11.12239262363012, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 9.32167911529541, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6746838688850403, "step": 1851, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9993693828582764 }, { "episode": 29648, "epoch": 0.5329115289211633, "loss/policy_avg": 0.035516563802957535, "lr": 8.816462167689163e-06, "objective/entropy": -440.9836120605469, "objective/kl": 30.971391677856445, "objective/non_score_reward": -3.097139358520508, "objective/rlhf_reward": -7.988557434082031, "objective/scores": 1.1, "policy/approxkl_avg": 4.354112148284912, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7476191520690918, "step": 1852, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 2.0013256072998047 }, { "episode": 29664, "epoch": 0.533199122838552, "loss/policy_avg": 1.009150505065918, "lr": 8.81582310838446e-06, "objective/entropy": -382.2260437011719, "objective/kl": 34.55077362060547, "objective/non_score_reward": -3.4550774097442627, "objective/rlhf_reward": -11.42031035423279, "objective/scores": 0.6, "policy/approxkl_avg": 2.1532769203186035, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6848031878471375, "step": 1853, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 14, "val/ratio": 1.9983932971954346 }, { "episode": 29680, "epoch": 0.5334867167559406, "loss/policy_avg": 0.15012891590595245, "lr": 8.815184049079757e-06, "objective/entropy": -361.4661560058594, "objective/kl": 28.830093383789062, "objective/non_score_reward": -2.883009433746338, "objective/rlhf_reward": -9.707208748134683, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 5.7027387619018555, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5433046817779541, "step": 1854, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.99971342086792 }, { "episode": 29696, "epoch": 0.5337743106733293, "loss/policy_avg": 1.366466999053955, "lr": 8.814544989775052e-06, "objective/entropy": -413.43408203125, "objective/kl": 29.800827026367188, "objective/non_score_reward": -2.9800827503204346, "objective/rlhf_reward": -9.797625007406744, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 6.360522747039795, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.740514874458313, "step": 1855, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 14, "val/ratio": 1.9982011318206787 }, { "episode": 29712, "epoch": 0.5340619045907179, "loss/policy_avg": 0.7949534058570862, "lr": 8.813905930470349e-06, "objective/entropy": -451.86260986328125, "objective/kl": 37.068763732910156, "objective/non_score_reward": -3.706876516342163, "objective/rlhf_reward": -13.427505588531496, "objective/scores": 0.35, "policy/approxkl_avg": 13.020347595214844, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6969761848449707, "step": 1856, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9990066289901733 }, { "episode": 29728, "epoch": 0.5343494985081065, "loss/policy_avg": 0.28128236532211304, "lr": 8.813266871165644e-06, "objective/entropy": -134.083251953125, "objective/kl": 42.974483489990234, "objective/non_score_reward": -4.29744815826416, "objective/rlhf_reward": -15.765961249073115, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 3.228285312652588, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5537855625152588, "step": 1857, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.000051975250244 }, { "episode": 29744, "epoch": 0.5346370924254952, "loss/policy_avg": 1.0371739864349365, "lr": 8.81262781186094e-06, "objective/entropy": -435.865966796875, "objective/kl": 41.57793045043945, "objective/non_score_reward": -4.157793045043945, "objective/rlhf_reward": -15.027052197519858, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 2.3608429431915283, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.624181866645813, "step": 1858, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.999916434288025 }, { "episode": 29760, "epoch": 0.5349246863428838, "loss/policy_avg": 0.34511005878448486, "lr": 8.811988752556238e-06, "objective/entropy": -421.1822204589844, "objective/kl": 45.783531188964844, "objective/non_score_reward": -4.578352928161621, "objective/rlhf_reward": -16.580077902475992, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 22.724811553955078, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6658442616462708, "step": 1859, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.9966487884521484 }, { "episode": 29776, "epoch": 0.5352122802602725, "loss/policy_avg": 0.9107069969177246, "lr": 8.811349693251534e-06, "objective/entropy": -342.1166687011719, "objective/kl": 32.668312072753906, "objective/non_score_reward": -3.2668309211730957, "objective/rlhf_reward": -11.616726021380767, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 9.692235946655273, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6248102784156799, "step": 1860, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.997905969619751 }, { "episode": 29792, "epoch": 0.5354998741776611, "loss/policy_avg": 1.1473510265350342, "lr": 8.810710633946831e-06, "objective/entropy": -385.346923828125, "objective/kl": 40.38262939453125, "objective/non_score_reward": -4.038262844085693, "objective/rlhf_reward": -13.229332123638365, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 16.56662368774414, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.721815824508667, "step": 1861, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 14, "val/ratio": 1.9978728294372559 }, { "episode": 29808, "epoch": 0.5357874680950497, "loss/policy_avg": -0.5452967882156372, "lr": 8.810071574642127e-06, "objective/entropy": -419.7779541015625, "objective/kl": 37.67274475097656, "objective/non_score_reward": -3.7672743797302246, "objective/rlhf_reward": -13.6690975189209, "objective/scores": 0.35, "policy/approxkl_avg": 1.5192656517028809, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.8436002135276794, "step": 1862, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 2.000840425491333 }, { "episode": 29824, "epoch": 0.5360750620124385, "loss/policy_avg": 0.10396115481853485, "lr": 8.809432515337423e-06, "objective/entropy": -433.44677734375, "objective/kl": 34.4306640625, "objective/non_score_reward": -3.4430665969848633, "objective/rlhf_reward": -12.348434765537348, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 1.789155125617981, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7376961708068848, "step": 1863, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.000107765197754 }, { "episode": 29840, "epoch": 0.5363626559298271, "loss/policy_avg": 0.12280648946762085, "lr": 8.80879345603272e-06, "objective/entropy": -442.9119873046875, "objective/kl": 32.202186584472656, "objective/non_score_reward": -3.2202188968658447, "objective/rlhf_reward": -11.147542015711466, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 10.399396896362305, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6757767200469971, "step": 1864, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 14, "val/ratio": 1.9982423782348633 }, { "episode": 29856, "epoch": 0.5366502498472158, "loss/policy_avg": 1.8007351160049438, "lr": 8.808154396728017e-06, "objective/entropy": -402.8224792480469, "objective/kl": 35.46202087402344, "objective/non_score_reward": -3.5462019443511963, "objective/rlhf_reward": -12.062101187483343, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 7.2923994064331055, "policy/clipfrac_avg": 0.25, "policy/entropy_avg": 0.6616877317428589, "step": 1865, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.9989454746246338 }, { "episode": 29872, "epoch": 0.5369378437646044, "loss/policy_avg": 2.497861862182617, "lr": 8.807515337423314e-06, "objective/entropy": -461.1963806152344, "objective/kl": 37.91127014160156, "objective/non_score_reward": -3.7911272048950195, "objective/rlhf_reward": -13.683556559498669, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 5.040108680725098, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6053213477134705, "step": 1866, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 1.9989283084869385 }, { "episode": 29888, "epoch": 0.537225437681993, "loss/policy_avg": 0.7831843495368958, "lr": 8.806876278118611e-06, "objective/entropy": -388.92327880859375, "objective/kl": 37.51856231689453, "objective/non_score_reward": -3.7518563270568848, "objective/rlhf_reward": -15.007424354553223, "objective/scores": 0.0, "policy/approxkl_avg": 10.043133735656738, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6960707902908325, "step": 1867, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9988772869110107 }, { "episode": 29904, "epoch": 0.5375130315993817, "loss/policy_avg": 0.09445878863334656, "lr": 8.806237218813906e-06, "objective/entropy": -409.1226501464844, "objective/kl": 33.533531188964844, "objective/non_score_reward": -3.3533530235290527, "objective/rlhf_reward": -11.932458999569775, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 1.012753963470459, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6381877064704895, "step": 1868, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 2.0004613399505615 }, { "episode": 29920, "epoch": 0.5378006255167703, "loss/policy_avg": -0.18073031306266785, "lr": 8.805598159509203e-06, "objective/entropy": -398.24072265625, "objective/kl": 36.278446197509766, "objective/non_score_reward": -3.6278445720672607, "objective/rlhf_reward": -12.778045193354288, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 47.01924133300781, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.8266734480857849, "step": 1869, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9989914894104004 }, { "episode": 29936, "epoch": 0.538088219434159, "loss/policy_avg": 1.165833830833435, "lr": 8.8049591002045e-06, "objective/entropy": -408.45098876953125, "objective/kl": 35.12698745727539, "objective/non_score_reward": -3.5126986503601074, "objective/rlhf_reward": -12.535022818835909, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 2.996471643447876, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.6892898082733154, "step": 1870, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9991648197174072 }, { "episode": 29952, "epoch": 0.5383758133515476, "loss/policy_avg": 0.18403103947639465, "lr": 8.804320040899797e-06, "objective/entropy": -405.0128173828125, "objective/kl": 28.530359268188477, "objective/non_score_reward": -2.8530359268188477, "objective/rlhf_reward": -7.0121441841125485, "objective/scores": 1.1, "policy/approxkl_avg": 2.8809962272644043, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.748741865158081, "step": 1871, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.0005087852478027 }, { "episode": 29968, "epoch": 0.5386634072689362, "loss/policy_avg": 0.10751471668481827, "lr": 8.803680981595094e-06, "objective/entropy": -360.5135803222656, "objective/kl": 37.048187255859375, "objective/non_score_reward": -3.7048187255859375, "objective/rlhf_reward": -13.419274425506591, "objective/scores": 0.35, "policy/approxkl_avg": 6.395877361297607, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6104768514633179, "step": 1872, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9991977214813232 }, { "episode": 29984, "epoch": 0.5389510011863249, "loss/policy_avg": 0.6946390867233276, "lr": 8.803041922290389e-06, "objective/entropy": -351.6586608886719, "objective/kl": 43.889610290527344, "objective/non_score_reward": -4.388960838317871, "objective/rlhf_reward": -15.731015081676553, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 15.217326164245605, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8224897384643555, "step": 1873, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.9982489347457886 }, { "episode": 30000, "epoch": 0.5392385951037135, "loss/policy_avg": 0.5935176610946655, "lr": 8.802402862985686e-06, "objective/entropy": -413.40277099609375, "objective/kl": 30.107940673828125, "objective/non_score_reward": -3.010794162750244, "objective/rlhf_reward": -10.643176889419555, "objective/scores": 0.35, "policy/approxkl_avg": 13.160199165344238, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7384727001190186, "step": 1874, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.998518705368042 }, { "episode": 30016, "epoch": 0.5395261890211022, "loss/policy_avg": 1.2324717044830322, "lr": 8.801763803680983e-06, "objective/entropy": -385.6772155761719, "objective/kl": 29.91301155090332, "objective/non_score_reward": -2.9913010597229004, "objective/rlhf_reward": -10.639691147833986, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 7.299016952514648, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6093276739120483, "step": 1875, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.997147560119629 }, { "episode": 30032, "epoch": 0.5398137829384908, "loss/policy_avg": -0.11620958149433136, "lr": 8.80112474437628e-06, "objective/entropy": -426.99334716796875, "objective/kl": 34.88370895385742, "objective/non_score_reward": -3.488370895385742, "objective/rlhf_reward": -9.553482866287231, "objective/scores": 1.1, "policy/approxkl_avg": 106.58057403564453, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7247900366783142, "step": 1876, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 15, "val/ratio": 2.000845432281494 }, { "episode": 30048, "epoch": 0.5401013768558794, "loss/policy_avg": 0.5161746740341187, "lr": 8.800485685071576e-06, "objective/entropy": -403.6054382324219, "objective/kl": 35.36531066894531, "objective/non_score_reward": -3.5365312099456787, "objective/rlhf_reward": -9.746125078201294, "objective/scores": 1.1, "policy/approxkl_avg": 5.007146835327148, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8434112071990967, "step": 1877, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 2.000014543533325 }, { "episode": 30064, "epoch": 0.5403889707732682, "loss/policy_avg": -0.016331974416971207, "lr": 8.799846625766873e-06, "objective/entropy": -420.46575927734375, "objective/kl": 35.63881301879883, "objective/non_score_reward": -3.5638811588287354, "objective/rlhf_reward": -12.593665128172027, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 2.004625082015991, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.788866400718689, "step": 1878, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 15, "val/ratio": 2.000863552093506 }, { "episode": 30080, "epoch": 0.5406765646906568, "loss/policy_avg": 1.0102704763412476, "lr": 8.799207566462168e-06, "objective/entropy": -360.9938049316406, "objective/kl": 35.284542083740234, "objective/non_score_reward": -3.528454542160034, "objective/rlhf_reward": -12.735215523330073, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 12.315423965454102, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.689734935760498, "step": 1879, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 19, "val/ratio": 1.9982883930206299 }, { "episode": 30096, "epoch": 0.5409641586080455, "loss/policy_avg": 0.4730093479156494, "lr": 8.798568507157465e-06, "objective/entropy": -431.2374267578125, "objective/kl": 36.703895568847656, "objective/non_score_reward": -3.6703896522521973, "objective/rlhf_reward": -13.28155884742737, "objective/scores": 0.35, "policy/approxkl_avg": 3.63038969039917, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6414831876754761, "step": 1880, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.998375415802002 }, { "episode": 30112, "epoch": 0.5412517525254341, "loss/policy_avg": 0.43901675939559937, "lr": 8.79792944785276e-06, "objective/entropy": -429.146240234375, "objective/kl": 44.59827423095703, "objective/non_score_reward": -4.459827423095703, "objective/rlhf_reward": -14.915591870189878, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 5.746644020080566, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6550036668777466, "step": 1881, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9982911348342896 }, { "episode": 30128, "epoch": 0.5415393464428228, "loss/policy_avg": 0.1869012415409088, "lr": 8.797290388548057e-06, "objective/entropy": -381.11676025390625, "objective/kl": 35.51026916503906, "objective/non_score_reward": -3.5510268211364746, "objective/rlhf_reward": -12.379278774532388, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 0.9573068618774414, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.617135226726532, "step": 1882, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 2.000216245651245 }, { "episode": 30144, "epoch": 0.5418269403602114, "loss/policy_avg": -0.048758573830127716, "lr": 8.796651329243354e-06, "objective/entropy": -427.4644775390625, "objective/kl": 36.07917785644531, "objective/non_score_reward": -3.6079182624816895, "objective/rlhf_reward": -12.769813065946686, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 6.684242248535156, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7206639647483826, "step": 1883, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9999747276306152 }, { "episode": 30160, "epoch": 0.5421145342776, "loss/policy_avg": 0.414611279964447, "lr": 8.796012269938651e-06, "objective/entropy": -384.4620666503906, "objective/kl": 37.16790008544922, "objective/non_score_reward": -3.716789960861206, "objective/rlhf_reward": -13.48855767497192, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 1.949611783027649, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6061092615127563, "step": 1884, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9999921321868896 }, { "episode": 30176, "epoch": 0.5424021281949887, "loss/policy_avg": -0.005995124578475952, "lr": 8.795373210633948e-06, "objective/entropy": -344.6195373535156, "objective/kl": 28.49842071533203, "objective/non_score_reward": -2.849842071533203, "objective/rlhf_reward": -9.843109576907707, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 6.092833518981934, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6502339839935303, "step": 1885, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 14, "val/ratio": 2.001105308532715 }, { "episode": 30192, "epoch": 0.5426897221123773, "loss/policy_avg": 0.5528745651245117, "lr": 8.794734151329243e-06, "objective/entropy": -340.13983154296875, "objective/kl": 48.298675537109375, "objective/non_score_reward": -4.829867362976074, "objective/rlhf_reward": -17.89563806792077, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 3.548684597015381, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5107759237289429, "step": 1886, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9997892379760742 }, { "episode": 30208, "epoch": 0.542977316029766, "loss/policy_avg": 1.2769412994384766, "lr": 8.79409509202454e-06, "objective/entropy": -445.042724609375, "objective/kl": 33.84895324707031, "objective/non_score_reward": -3.3848953247070312, "objective/rlhf_reward": -12.16097901114593, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 3.024454355239868, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5982992053031921, "step": 1887, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.0003395080566406 }, { "episode": 30224, "epoch": 0.5432649099471546, "loss/policy_avg": 0.3335120379924774, "lr": 8.793456032719837e-06, "objective/entropy": -406.7044372558594, "objective/kl": 34.271934509277344, "objective/non_score_reward": -3.4271938800811768, "objective/rlhf_reward": -13.708775043487549, "objective/scores": 0.0, "policy/approxkl_avg": 41.10575866699219, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7102169990539551, "step": 1888, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.998976707458496 }, { "episode": 30240, "epoch": 0.5435525038645432, "loss/policy_avg": 0.07910682260990143, "lr": 8.792816973415134e-06, "objective/entropy": -424.95025634765625, "objective/kl": 30.39816665649414, "objective/non_score_reward": -3.0398166179656982, "objective/rlhf_reward": -10.643494689258274, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 2.5409979820251465, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5519711375236511, "step": 1889, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 1.9981714487075806 }, { "episode": 30256, "epoch": 0.5438400977819319, "loss/policy_avg": -0.06054295599460602, "lr": 8.79217791411043e-06, "objective/entropy": -427.6455078125, "objective/kl": 32.384437561035156, "objective/non_score_reward": -3.238443613052368, "objective/rlhf_reward": -11.3975153853565, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 63.454803466796875, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.7060720920562744, "step": 1890, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 2.001728057861328 }, { "episode": 30272, "epoch": 0.5441276916993205, "loss/policy_avg": -0.04710587114095688, "lr": 8.791538854805728e-06, "objective/entropy": -454.0593566894531, "objective/kl": 30.00592803955078, "objective/non_score_reward": -3.0005929470062256, "objective/rlhf_reward": -10.177542562755654, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 15.102546691894531, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7401454448699951, "step": 1891, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.0014076232910156 }, { "episode": 30288, "epoch": 0.5444152856167092, "loss/policy_avg": 0.9605652093887329, "lr": 8.790899795501023e-06, "objective/entropy": -412.4889831542969, "objective/kl": 30.16094398498535, "objective/non_score_reward": -3.01609468460083, "objective/rlhf_reward": -10.640546400745478, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 2.470215320587158, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6489913463592529, "step": 1892, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.998854160308838 }, { "episode": 30304, "epoch": 0.5447028795340979, "loss/policy_avg": 0.15633176267147064, "lr": 8.79026073619632e-06, "objective/entropy": -437.66778564453125, "objective/kl": 31.831527709960938, "objective/non_score_reward": -3.183152675628662, "objective/rlhf_reward": -11.407097730666322, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 56.151588439941406, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5976850986480713, "step": 1893, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9981919527053833 }, { "episode": 30320, "epoch": 0.5449904734514865, "loss/policy_avg": 0.2935134470462799, "lr": 8.789621676891616e-06, "objective/entropy": -408.64520263671875, "objective/kl": 38.42247772216797, "objective/non_score_reward": -3.84224796295166, "objective/rlhf_reward": -13.990389683333735, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 23.678630828857422, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.6335173845291138, "step": 1894, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 1.9992420673370361 }, { "episode": 30336, "epoch": 0.5452780673688752, "loss/policy_avg": 0.17721040546894073, "lr": 8.788982617586913e-06, "objective/entropy": -347.135498046875, "objective/kl": 25.30129623413086, "objective/non_score_reward": -2.530129909515381, "objective/rlhf_reward": -5.720519399642944, "objective/scores": 1.1, "policy/approxkl_avg": 39.93634033203125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7696830034255981, "step": 1895, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.998798131942749 }, { "episode": 30352, "epoch": 0.5455656612862638, "loss/policy_avg": -0.04025883972644806, "lr": 8.78834355828221e-06, "objective/entropy": -440.85137939453125, "objective/kl": 24.157392501831055, "objective/non_score_reward": -2.4157392978668213, "objective/rlhf_reward": -8.05883720881136, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 12.33606243133545, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6774711608886719, "step": 1896, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9997658729553223 }, { "episode": 30368, "epoch": 0.5458532552036525, "loss/policy_avg": 0.3346291780471802, "lr": 8.787704498977505e-06, "objective/entropy": -380.12078857421875, "objective/kl": 25.318157196044922, "objective/non_score_reward": -2.531815767288208, "objective/rlhf_reward": -7.727263069152832, "objective/scores": 0.6, "policy/approxkl_avg": 0.46912604570388794, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.5926394462585449, "step": 1897, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.001295566558838 }, { "episode": 30384, "epoch": 0.5461408491210411, "loss/policy_avg": 4.834578037261963, "lr": 8.787065439672802e-06, "objective/entropy": -403.2817687988281, "objective/kl": 36.78810119628906, "objective/non_score_reward": -3.6788105964660645, "objective/rlhf_reward": -12.981908933321634, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 5.106264114379883, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.7132128477096558, "step": 1898, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.000145435333252 }, { "episode": 30400, "epoch": 0.5464284430384297, "loss/policy_avg": 0.20910179615020752, "lr": 8.7864263803681e-06, "objective/entropy": -404.0404357910156, "objective/kl": 27.075878143310547, "objective/non_score_reward": -2.707587718963623, "objective/rlhf_reward": -6.430350458621978, "objective/scores": 1.1, "policy/approxkl_avg": 22.9856014251709, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5931397676467896, "step": 1899, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.999509334564209 }, { "episode": 30416, "epoch": 0.5467160369558184, "loss/policy_avg": 0.5005283355712891, "lr": 8.785787321063396e-06, "objective/entropy": -425.8792724609375, "objective/kl": 35.993370056152344, "objective/non_score_reward": -3.599337100982666, "objective/rlhf_reward": -12.73548865836917, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 13.124953269958496, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.6571193933486938, "step": 1900, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9987658262252808 }, { "episode": 30432, "epoch": 0.547003630873207, "loss/policy_avg": 0.707127571105957, "lr": 8.785148261758691e-06, "objective/entropy": -408.89251708984375, "objective/kl": 31.42521095275879, "objective/non_score_reward": -3.1425209045410156, "objective/rlhf_reward": -11.228448203116088, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 2.722949981689453, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6954042315483093, "step": 1901, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.999962329864502 }, { "episode": 30448, "epoch": 0.5472912247905957, "loss/policy_avg": 0.07642310857772827, "lr": 8.784509202453988e-06, "objective/entropy": -425.4870910644531, "objective/kl": 27.78850555419922, "objective/non_score_reward": -2.778850555419922, "objective/rlhf_reward": -8.992695989386116, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 1.274503231048584, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6606495976448059, "step": 1902, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9997704029083252 }, { "episode": 30464, "epoch": 0.5475788187079843, "loss/policy_avg": -0.18466167151927948, "lr": 8.783870143149285e-06, "objective/entropy": -400.5853271484375, "objective/kl": 36.4433708190918, "objective/non_score_reward": -3.6443374156951904, "objective/rlhf_reward": -12.177349662780763, "objective/scores": 0.6, "policy/approxkl_avg": 4.72365140914917, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7156212329864502, "step": 1903, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 16, "val/ratio": 2.000715494155884 }, { "episode": 30480, "epoch": 0.547866412625373, "loss/policy_avg": 0.03173200786113739, "lr": 8.783231083844582e-06, "objective/entropy": -418.78118896484375, "objective/kl": 38.735069274902344, "objective/non_score_reward": -3.873506546020508, "objective/rlhf_reward": -13.97825487831467, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 63.272491455078125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.653575599193573, "step": 1904, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 15, "val/ratio": 1.9985655546188354 }, { "episode": 30496, "epoch": 0.5481540065427616, "loss/policy_avg": 0.20278006792068481, "lr": 8.782592024539877e-06, "objective/entropy": -403.6837463378906, "objective/kl": 31.187583923339844, "objective/non_score_reward": -3.1187584400177, "objective/rlhf_reward": -11.133398106604247, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 18.81614875793457, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6424790620803833, "step": 1905, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9980148077011108 }, { "episode": 30512, "epoch": 0.5484416004601502, "loss/policy_avg": 0.6811771988868713, "lr": 8.781952965235174e-06, "objective/entropy": -473.90093994140625, "objective/kl": 24.395801544189453, "objective/non_score_reward": -2.4395804405212402, "objective/rlhf_reward": -7.635614695326362, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 3.1705455780029297, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6661382913589478, "step": 1906, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.998711109161377 }, { "episode": 30528, "epoch": 0.5487291943775389, "loss/policy_avg": 1.5239169597625732, "lr": 8.78131390593047e-06, "objective/entropy": -455.513916015625, "objective/kl": 41.25617980957031, "objective/non_score_reward": -4.125617980957031, "objective/rlhf_reward": -15.051873783679351, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 2.560035228729248, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7762479782104492, "step": 1907, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.998887300491333 }, { "episode": 30544, "epoch": 0.5490167882949276, "loss/policy_avg": 0.6401779055595398, "lr": 8.780674846625768e-06, "objective/entropy": -369.55645751953125, "objective/kl": 35.06276321411133, "objective/non_score_reward": -3.5062763690948486, "objective/rlhf_reward": -12.665855610106867, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 1.0147533416748047, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6460734605789185, "step": 1908, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 2.0001068115234375 }, { "episode": 30560, "epoch": 0.5493043822123163, "loss/policy_avg": -0.06551143527030945, "lr": 8.780035787321065e-06, "objective/entropy": -438.845458984375, "objective/kl": 36.21104431152344, "objective/non_score_reward": -3.6211042404174805, "objective/rlhf_reward": -12.659587974819253, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 23.18450355529785, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.702975869178772, "step": 1909, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9994276762008667 }, { "episode": 30576, "epoch": 0.5495919761297049, "loss/policy_avg": 1.0463677644729614, "lr": 8.77939672801636e-06, "objective/entropy": -395.9627685546875, "objective/kl": 41.114051818847656, "objective/non_score_reward": -4.111405372619629, "objective/rlhf_reward": -15.086371624205988, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 3.1322407722473145, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6432448029518127, "step": 1910, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.998486042022705 }, { "episode": 30592, "epoch": 0.5498795700470935, "loss/policy_avg": 1.8733429908752441, "lr": 8.778757668711657e-06, "objective/entropy": -441.1005859375, "objective/kl": 30.269611358642578, "objective/non_score_reward": -3.026961326599121, "objective/rlhf_reward": -10.76620965292993, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 8.139184951782227, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.7630742192268372, "step": 1911, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.99787437915802 }, { "episode": 30608, "epoch": 0.5501671639644822, "loss/policy_avg": 0.05697975307703018, "lr": 8.778118609406954e-06, "objective/entropy": -429.45123291015625, "objective/kl": 19.556673049926758, "objective/non_score_reward": -1.9556673765182495, "objective/rlhf_reward": -5.997840578826974, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 5.16174840927124, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6320275068283081, "step": 1912, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.998305082321167 }, { "episode": 30624, "epoch": 0.5504547578818708, "loss/policy_avg": 0.18331314623355865, "lr": 8.77747955010225e-06, "objective/entropy": -424.5577392578125, "objective/kl": 33.609474182128906, "objective/non_score_reward": -3.3609471321105957, "objective/rlhf_reward": -12.043789243698122, "objective/scores": 0.35, "policy/approxkl_avg": 2.872340202331543, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6509475708007812, "step": 1913, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 14, "val/ratio": 1.9987866878509521 }, { "episode": 30640, "epoch": 0.5507423517992595, "loss/policy_avg": 2.0698816776275635, "lr": 8.776840490797547e-06, "objective/entropy": -403.4563903808594, "objective/kl": 26.522327423095703, "objective/non_score_reward": -2.6522328853607178, "objective/rlhf_reward": -10.608931303024292, "objective/scores": 0.0, "policy/approxkl_avg": 6.603757381439209, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7082518339157104, "step": 1914, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9989047050476074 }, { "episode": 30656, "epoch": 0.5510299457166481, "loss/policy_avg": 1.278648018836975, "lr": 8.776201431492844e-06, "objective/entropy": -416.79571533203125, "objective/kl": 35.081119537353516, "objective/non_score_reward": -3.5081119537353516, "objective/rlhf_reward": -14.032448053359985, "objective/scores": 0.0, "policy/approxkl_avg": 33.39091110229492, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5837923884391785, "step": 1915, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9977941513061523 }, { "episode": 30672, "epoch": 0.5513175396340367, "loss/policy_avg": 0.8528362512588501, "lr": 8.77556237218814e-06, "objective/entropy": -423.3309326171875, "objective/kl": 33.59093475341797, "objective/non_score_reward": -3.35909366607666, "objective/rlhf_reward": -11.611546631130288, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 3.6560845375061035, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5798410177230835, "step": 1916, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 2.0001797676086426 }, { "episode": 30688, "epoch": 0.5516051335514254, "loss/policy_avg": -0.22073540091514587, "lr": 8.774923312883436e-06, "objective/entropy": -377.3043212890625, "objective/kl": 26.552669525146484, "objective/non_score_reward": -2.6552670001983643, "objective/rlhf_reward": -10.621067762374878, "objective/scores": 0.0, "policy/approxkl_avg": 42.18861389160156, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7460348606109619, "step": 1917, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.999443531036377 }, { "episode": 30704, "epoch": 0.551892727468814, "loss/policy_avg": -0.13114827871322632, "lr": 8.774284253578733e-06, "objective/entropy": -455.10888671875, "objective/kl": 21.73941421508789, "objective/non_score_reward": -2.1739416122436523, "objective/rlhf_reward": -7.295766568183899, "objective/scores": 0.35, "policy/approxkl_avg": 1.5294067859649658, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6820036172866821, "step": 1918, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.000469923019409 }, { "episode": 30720, "epoch": 0.5521803213862027, "loss/policy_avg": 0.5328643321990967, "lr": 8.77364519427403e-06, "objective/entropy": -405.8944396972656, "objective/kl": 37.25814437866211, "objective/non_score_reward": -3.7258143424987793, "objective/rlhf_reward": -13.346997826304985, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 4.811860084533691, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7180016040802002, "step": 1919, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.0002267360687256 }, { "episode": 30736, "epoch": 0.5524679153035913, "loss/policy_avg": 0.7952775955200195, "lr": 8.773006134969327e-06, "objective/entropy": -414.86810302734375, "objective/kl": 35.42617416381836, "objective/non_score_reward": -3.5426173210144043, "objective/rlhf_reward": -12.689517143185498, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 31.24676513671875, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6751524209976196, "step": 1920, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9984729290008545 }, { "episode": 30752, "epoch": 0.5527555092209799, "loss/policy_avg": 0.7343195676803589, "lr": 8.772367075664622e-06, "objective/entropy": -409.80535888671875, "objective/kl": 29.384769439697266, "objective/non_score_reward": -2.938477039337158, "objective/rlhf_reward": -10.330076534946528, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 2.391204357147217, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.718669056892395, "step": 1921, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9992034435272217 }, { "episode": 30768, "epoch": 0.5530431031383686, "loss/policy_avg": -0.44611844420433044, "lr": 8.771728016359919e-06, "objective/entropy": -388.0859375, "objective/kl": 43.557899475097656, "objective/non_score_reward": -4.355789661407471, "objective/rlhf_reward": -15.689825312296549, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 8.103779792785645, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5824436545372009, "step": 1922, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9997856616973877 }, { "episode": 30784, "epoch": 0.5533306970557573, "loss/policy_avg": -0.07979650050401688, "lr": 8.771088957055214e-06, "objective/entropy": -399.025634765625, "objective/kl": 27.762096405029297, "objective/non_score_reward": -2.776209592819214, "objective/rlhf_reward": -11.104838490486145, "objective/scores": 0.0, "policy/approxkl_avg": 3.073312759399414, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7794772386550903, "step": 1923, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 15, "val/ratio": 2.0027174949645996 }, { "episode": 30800, "epoch": 0.553618290973146, "loss/policy_avg": 0.01149098202586174, "lr": 8.770449897750511e-06, "objective/entropy": -407.55560302734375, "objective/kl": 46.517555236816406, "objective/non_score_reward": -4.651755332946777, "objective/rlhf_reward": -17.002901825968344, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 7.507414817810059, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6711137294769287, "step": 1924, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9981261491775513 }, { "episode": 30816, "epoch": 0.5539058848905346, "loss/policy_avg": 0.9915018081665039, "lr": 8.769810838445808e-06, "objective/entropy": -512.4793701171875, "objective/kl": 23.08246612548828, "objective/non_score_reward": -2.3082470893859863, "objective/rlhf_reward": -7.832987642288208, "objective/scores": 0.35, "policy/approxkl_avg": 6.494199275970459, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.6374315023422241, "step": 1925, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.0009591579437256 }, { "episode": 30832, "epoch": 0.5541934788079232, "loss/policy_avg": 0.8262364864349365, "lr": 8.769171779141105e-06, "objective/entropy": -423.6557922363281, "objective/kl": 32.833152770996094, "objective/non_score_reward": -3.283315658569336, "objective/rlhf_reward": -11.733262157440185, "objective/scores": 0.35, "policy/approxkl_avg": 3.411613941192627, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7843738794326782, "step": 1926, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9997050762176514 }, { "episode": 30848, "epoch": 0.5544810727253119, "loss/policy_avg": 0.21764275431632996, "lr": 8.768532719836402e-06, "objective/entropy": -427.3155517578125, "objective/kl": 34.70395278930664, "objective/non_score_reward": -3.47039532661438, "objective/rlhf_reward": -11.934170077519353, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 11.696794509887695, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7824058532714844, "step": 1927, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9991810321807861 }, { "episode": 30864, "epoch": 0.5547686666427005, "loss/policy_avg": -0.025159945711493492, "lr": 8.767893660531698e-06, "objective/entropy": -391.4906005859375, "objective/kl": 32.45024108886719, "objective/non_score_reward": -3.2450242042541504, "objective/rlhf_reward": -8.58009729385376, "objective/scores": 1.1, "policy/approxkl_avg": 1.391406536102295, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6593641638755798, "step": 1928, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 2.0016846656799316 }, { "episode": 30880, "epoch": 0.5550562605600892, "loss/policy_avg": 4.7467732429504395, "lr": 8.767254601226994e-06, "objective/entropy": -413.7510986328125, "objective/kl": 29.91855812072754, "objective/non_score_reward": -2.9918558597564697, "objective/rlhf_reward": -10.543591101368037, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 5.765658378601074, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.6808304786682129, "step": 1929, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9998096227645874 }, { "episode": 30896, "epoch": 0.5553438544774778, "loss/policy_avg": 1.2263035774230957, "lr": 8.76661554192229e-06, "objective/entropy": -430.93548583984375, "objective/kl": 42.155094146728516, "objective/non_score_reward": -4.215508937835693, "objective/rlhf_reward": -15.483434536544184, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 4.349003791809082, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6266762018203735, "step": 1930, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9973292350769043 }, { "episode": 30912, "epoch": 0.5556314483948664, "loss/policy_avg": 0.26843756437301636, "lr": 8.765976482617587e-06, "objective/entropy": -373.7245788574219, "objective/kl": 40.3361701965332, "objective/non_score_reward": -4.03361701965332, "objective/rlhf_reward": -14.472608094633209, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 55.20069885253906, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7185014486312866, "step": 1931, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 14, "val/ratio": 1.9987622499465942 }, { "episode": 30928, "epoch": 0.5559190423122551, "loss/policy_avg": 1.9504499435424805, "lr": 8.765337423312884e-06, "objective/entropy": -360.90557861328125, "objective/kl": 30.98790740966797, "objective/non_score_reward": -3.0987911224365234, "objective/rlhf_reward": -7.99516395330429, "objective/scores": 1.1, "policy/approxkl_avg": 9.15023422241211, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6674954891204834, "step": 1932, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.999056100845337 }, { "episode": 30944, "epoch": 0.5562066362296437, "loss/policy_avg": -0.32322046160697937, "lr": 8.764698364008181e-06, "objective/entropy": -421.4532470703125, "objective/kl": 36.986915588378906, "objective/non_score_reward": -3.6986918449401855, "objective/rlhf_reward": -13.453131726294188, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 2.631162166595459, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.662186861038208, "step": 1933, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 2.000232219696045 }, { "episode": 30960, "epoch": 0.5564942301470324, "loss/policy_avg": -0.048752009868621826, "lr": 8.764059304703476e-06, "objective/entropy": -427.42681884765625, "objective/kl": 46.13719177246094, "objective/non_score_reward": -4.6137189865112305, "objective/rlhf_reward": -16.72154213587443, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 3.5000176429748535, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6966959238052368, "step": 1934, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.998393177986145 }, { "episode": 30976, "epoch": 0.556781824064421, "loss/policy_avg": 0.16182555258274078, "lr": 8.763420245398773e-06, "objective/entropy": -391.8072509765625, "objective/kl": 34.84031295776367, "objective/non_score_reward": -3.4840314388275146, "objective/rlhf_reward": -12.332005772654135, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 1.2516827583312988, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5749198794364929, "step": 1935, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9993823766708374 }, { "episode": 30992, "epoch": 0.5570694179818096, "loss/policy_avg": -0.1410239338874817, "lr": 8.76278118609407e-06, "objective/entropy": -419.3056640625, "objective/kl": 30.633142471313477, "objective/non_score_reward": -3.063314199447632, "objective/rlhf_reward": -9.329538021923277, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 1.3912264108657837, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6253663301467896, "step": 1936, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.00065541267395 }, { "episode": 31008, "epoch": 0.5573570118991983, "loss/policy_avg": 0.22697040438652039, "lr": 8.762142126789367e-06, "objective/entropy": -463.4902038574219, "objective/kl": 28.45330047607422, "objective/non_score_reward": -2.845329999923706, "objective/rlhf_reward": -11.381319999694824, "objective/scores": 0.0, "policy/approxkl_avg": 1.432274580001831, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.683651328086853, "step": 1937, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.000584125518799 }, { "episode": 31024, "epoch": 0.5576446058165869, "loss/policy_avg": 1.191375970840454, "lr": 8.761503067484664e-06, "objective/entropy": -424.45379638671875, "objective/kl": 31.107091903686523, "objective/non_score_reward": -3.1107091903686523, "objective/rlhf_reward": -10.618008489879678, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 10.213705062866211, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6888469457626343, "step": 1938, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9998940229415894 }, { "episode": 31040, "epoch": 0.5579321997339757, "loss/policy_avg": 0.0396411269903183, "lr": 8.76086400817996e-06, "objective/entropy": -396.374755859375, "objective/kl": 38.65277862548828, "objective/non_score_reward": -3.8652782440185547, "objective/rlhf_reward": -14.119477084189086, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 1.268958330154419, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6052371263504028, "step": 1939, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.000861644744873 }, { "episode": 31056, "epoch": 0.5582197936513643, "loss/policy_avg": 0.4586433470249176, "lr": 8.760224948875256e-06, "objective/entropy": -385.54547119140625, "objective/kl": 37.51795196533203, "objective/non_score_reward": -3.7517952919006348, "objective/rlhf_reward": -13.059770653920111, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 11.299270629882812, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7740880250930786, "step": 1940, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9986772537231445 }, { "episode": 31072, "epoch": 0.558507387568753, "loss/policy_avg": 0.08733612298965454, "lr": 8.759585889570553e-06, "objective/entropy": -405.750732421875, "objective/kl": 37.07811737060547, "objective/non_score_reward": -3.7078118324279785, "objective/rlhf_reward": -13.006418104442666, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 0.44874507188796997, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5844134092330933, "step": 1941, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 2.001251220703125 }, { "episode": 31088, "epoch": 0.5587949814861416, "loss/policy_avg": 0.5231389403343201, "lr": 8.75894683026585e-06, "objective/entropy": -349.93475341796875, "objective/kl": 44.573455810546875, "objective/non_score_reward": -4.457345485687256, "objective/rlhf_reward": -16.48774628928247, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 9.930280685424805, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7412935495376587, "step": 1942, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9970029592514038 }, { "episode": 31104, "epoch": 0.5590825754035302, "loss/policy_avg": 1.1834297180175781, "lr": 8.758307770961147e-06, "objective/entropy": -376.8563232421875, "objective/kl": 46.76148223876953, "objective/non_score_reward": -4.676148414611816, "objective/rlhf_reward": -17.100473437372763, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 4.97642707824707, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6049227714538574, "step": 1943, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 14, "val/ratio": 1.9996147155761719 }, { "episode": 31120, "epoch": 0.5593701693209189, "loss/policy_avg": 0.6898127794265747, "lr": 8.757668711656443e-06, "objective/entropy": -400.8194580078125, "objective/kl": 35.496726989746094, "objective/non_score_reward": -3.5496726036071777, "objective/rlhf_reward": -12.857055237799315, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 3.294158458709717, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5990235805511475, "step": 1944, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9985532760620117 }, { "episode": 31136, "epoch": 0.5596577632383075, "loss/policy_avg": 3.615034341812134, "lr": 8.757029652351739e-06, "objective/entropy": -389.68914794921875, "objective/kl": 37.99871063232422, "objective/non_score_reward": -3.7998712062835693, "objective/rlhf_reward": -13.683712565692598, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 1.8563079833984375, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7293685674667358, "step": 1945, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.0018317699432373 }, { "episode": 31152, "epoch": 0.5599453571556962, "loss/policy_avg": 0.06291456520557404, "lr": 8.756390593047036e-06, "objective/entropy": -407.5872802734375, "objective/kl": 33.16802215576172, "objective/non_score_reward": -3.3168020248413086, "objective/rlhf_reward": -11.816610436053619, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 0.757123589515686, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6693398356437683, "step": 1946, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.000835418701172 }, { "episode": 31168, "epoch": 0.5602329510730848, "loss/policy_avg": -0.020069241523742676, "lr": 8.75575153374233e-06, "objective/entropy": -409.98370361328125, "objective/kl": 29.820465087890625, "objective/non_score_reward": -2.982046604156494, "objective/rlhf_reward": -10.266326313436615, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 3.6394753456115723, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6026859283447266, "step": 1947, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9990215301513672 }, { "episode": 31184, "epoch": 0.5605205449904734, "loss/policy_avg": 0.34713196754455566, "lr": 8.755112474437628e-06, "objective/entropy": -436.7591552734375, "objective/kl": 45.081016540527344, "objective/non_score_reward": -4.508101463317871, "objective/rlhf_reward": -16.428287539545614, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 3.69284987449646, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6609814763069153, "step": 1948, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9977710247039795 }, { "episode": 31200, "epoch": 0.5608081389078621, "loss/policy_avg": 1.2998406887054443, "lr": 8.754473415132924e-06, "objective/entropy": -361.65325927734375, "objective/kl": 43.858665466308594, "objective/non_score_reward": -4.385866641998291, "objective/rlhf_reward": -15.93934658533724, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 17.617591857910156, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5252654552459717, "step": 1949, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9969923496246338 }, { "episode": 31216, "epoch": 0.5610957328252507, "loss/policy_avg": 0.012568116188049316, "lr": 8.753834355828221e-06, "objective/entropy": -423.6449890136719, "objective/kl": 43.5527229309082, "objective/non_score_reward": -4.35527229309082, "objective/rlhf_reward": -15.596261616024087, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 0.9570385217666626, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.521755576133728, "step": 1950, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.999729871749878 }, { "episode": 31232, "epoch": 0.5613833267426394, "loss/policy_avg": 0.4782177805900574, "lr": 8.753195296523518e-06, "objective/entropy": -431.0830383300781, "objective/kl": 39.72227478027344, "objective/non_score_reward": -3.9722275733947754, "objective/rlhf_reward": -14.547274401693969, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 9.08526611328125, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.6779612302780151, "step": 1951, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9982211589813232 }, { "episode": 31248, "epoch": 0.561670920660028, "loss/policy_avg": 0.9650096893310547, "lr": 8.752556237218815e-06, "objective/entropy": -416.23699951171875, "objective/kl": 29.791366577148438, "objective/non_score_reward": -2.9791369438171387, "objective/rlhf_reward": -10.435594561512827, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 3.317319869995117, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6212369799613953, "step": 1952, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.0006825923919678 }, { "episode": 31264, "epoch": 0.5619585145774166, "loss/policy_avg": 0.30630186200141907, "lr": 8.75191717791411e-06, "objective/entropy": -422.84033203125, "objective/kl": 26.326093673706055, "objective/non_score_reward": -2.6326093673706055, "objective/rlhf_reward": -8.13043794631958, "objective/scores": 0.6, "policy/approxkl_avg": 60.704864501953125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7889204025268555, "step": 1953, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.999643087387085 }, { "episode": 31280, "epoch": 0.5622461084948054, "loss/policy_avg": -0.06008259952068329, "lr": 8.751278118609407e-06, "objective/entropy": -408.38763427734375, "objective/kl": 48.580989837646484, "objective/non_score_reward": -4.858098983764648, "objective/rlhf_reward": -18.073146068786066, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 1.251701831817627, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7160055637359619, "step": 1954, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.0030007362365723 }, { "episode": 31296, "epoch": 0.562533702412194, "loss/policy_avg": 0.9898973703384399, "lr": 8.750639059304704e-06, "objective/entropy": -485.0399169921875, "objective/kl": 29.26266860961914, "objective/non_score_reward": -2.926266670227051, "objective/rlhf_reward": -10.224114301617504, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 9.360811233520508, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7378222942352295, "step": 1955, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.999152421951294 }, { "episode": 31312, "epoch": 0.5628212963295827, "loss/policy_avg": -0.11034160107374191, "lr": 8.750000000000001e-06, "objective/entropy": -387.22503662109375, "objective/kl": 38.42030334472656, "objective/non_score_reward": -3.8420300483703613, "objective/rlhf_reward": -13.420709202961858, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 28.666919708251953, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6756678223609924, "step": 1956, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9990785121917725 }, { "episode": 31328, "epoch": 0.5631088902469713, "loss/policy_avg": 2.1097350120544434, "lr": 8.749360940695298e-06, "objective/entropy": -279.7330017089844, "objective/kl": 34.413421630859375, "objective/non_score_reward": -3.4413421154022217, "objective/rlhf_reward": -12.4061181184992, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 3.6012513637542725, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5673644542694092, "step": 1957, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0020904541015625 }, { "episode": 31344, "epoch": 0.56339648416436, "loss/policy_avg": 0.14340202510356903, "lr": 8.748721881390595e-06, "objective/entropy": -389.0799560546875, "objective/kl": 39.62559509277344, "objective/non_score_reward": -3.962559223175049, "objective/rlhf_reward": -13.450237369537355, "objective/scores": 0.6, "policy/approxkl_avg": 7.059556484222412, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5443567633628845, "step": 1958, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 1.9989715814590454 }, { "episode": 31360, "epoch": 0.5636840780817486, "loss/policy_avg": 1.8846603631973267, "lr": 8.74808282208589e-06, "objective/entropy": -444.093994140625, "objective/kl": 45.765541076660156, "objective/non_score_reward": -4.576554298400879, "objective/rlhf_reward": -16.6443576864606, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 18.80606460571289, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6066471338272095, "step": 1959, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9994946718215942 }, { "episode": 31376, "epoch": 0.5639716719991372, "loss/policy_avg": 0.08173228055238724, "lr": 8.747443762781187e-06, "objective/entropy": -394.915771484375, "objective/kl": 38.63004684448242, "objective/non_score_reward": -3.863004684448242, "objective/rlhf_reward": -13.329312267080816, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 19.6087646484375, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.6044374704360962, "step": 1960, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.000743865966797 }, { "episode": 31392, "epoch": 0.5642592659165259, "loss/policy_avg": 0.14415226876735687, "lr": 8.746804703476484e-06, "objective/entropy": -372.4490966796875, "objective/kl": 30.687557220458984, "objective/non_score_reward": -3.068756103515625, "objective/rlhf_reward": -10.875023937225341, "objective/scores": 0.35, "policy/approxkl_avg": 2.177422285079956, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6854466199874878, "step": 1961, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.000678062438965 }, { "episode": 31408, "epoch": 0.5645468598339145, "loss/policy_avg": 0.5021145939826965, "lr": 8.74616564417178e-06, "objective/entropy": -364.66485595703125, "objective/kl": 28.377365112304688, "objective/non_score_reward": -2.8377366065979004, "objective/rlhf_reward": -10.0093101768786, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 1.675323486328125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4377620220184326, "step": 1962, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 2.0037808418273926 }, { "episode": 31424, "epoch": 0.5648344537513031, "loss/policy_avg": 0.6306454539299011, "lr": 8.745526584867077e-06, "objective/entropy": -328.26885986328125, "objective/kl": 29.911479949951172, "objective/non_score_reward": -2.991147994995117, "objective/rlhf_reward": -10.622956684141784, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 2.4721624851226807, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4944919943809509, "step": 1963, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.000112295150757 }, { "episode": 31440, "epoch": 0.5651220476686918, "loss/policy_avg": 0.48608070611953735, "lr": 8.744887525562373e-06, "objective/entropy": -389.5670166015625, "objective/kl": 37.015926361083984, "objective/non_score_reward": -3.701592445373535, "objective/rlhf_reward": -13.480857286482973, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 4.475109100341797, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.6682427525520325, "step": 1964, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.9988460540771484 }, { "episode": 31456, "epoch": 0.5654096415860804, "loss/policy_avg": 0.0909942165017128, "lr": 8.74424846625767e-06, "objective/entropy": -414.95111083984375, "objective/kl": 38.041297912597656, "objective/non_score_reward": -3.8041296005249023, "objective/rlhf_reward": -13.483184591929117, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 157.13272094726562, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.9385782480239868, "step": 1965, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 19, "val/ratio": 1.999542236328125 }, { "episode": 31472, "epoch": 0.5656972355034691, "loss/policy_avg": 0.2651042938232422, "lr": 8.743609406952966e-06, "objective/entropy": -442.9378967285156, "objective/kl": 38.39979553222656, "objective/non_score_reward": -3.839979648590088, "objective/rlhf_reward": -14.034405503302736, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 40.41947937011719, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5905368328094482, "step": 1966, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9981951713562012 }, { "episode": 31488, "epoch": 0.5659848294208577, "loss/policy_avg": -0.05953323096036911, "lr": 8.742970347648263e-06, "objective/entropy": -394.1260070800781, "objective/kl": 33.29319763183594, "objective/non_score_reward": -3.329319953918457, "objective/rlhf_reward": -11.492451067241738, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 5.72397518157959, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.577817440032959, "step": 1967, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.000959634780884 }, { "episode": 31504, "epoch": 0.5662724233382463, "loss/policy_avg": 0.030380478128790855, "lr": 8.742331288343558e-06, "objective/entropy": -433.0150146484375, "objective/kl": 28.464235305786133, "objective/non_score_reward": -2.846423625946045, "objective/rlhf_reward": -9.723835354269134, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 6.461413860321045, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.669304609298706, "step": 1968, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9995262622833252 }, { "episode": 31520, "epoch": 0.5665600172556351, "loss/policy_avg": -0.3630755543708801, "lr": 8.741692229038855e-06, "objective/entropy": -439.24444580078125, "objective/kl": 28.323928833007812, "objective/non_score_reward": -2.832393169403076, "objective/rlhf_reward": -9.848619583065867, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 9.46999740600586, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6480342149734497, "step": 1969, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.99942946434021 }, { "episode": 31536, "epoch": 0.5668476111730237, "loss/policy_avg": 0.3417593836784363, "lr": 8.741053169734152e-06, "objective/entropy": -428.2545166015625, "objective/kl": 32.256805419921875, "objective/non_score_reward": -3.2256805896759033, "objective/rlhf_reward": -10.502722597122192, "objective/scores": 0.6, "policy/approxkl_avg": 2.9857120513916016, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7147581577301025, "step": 1970, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9991461038589478 }, { "episode": 31552, "epoch": 0.5671352050904124, "loss/policy_avg": 0.8563758134841919, "lr": 8.740414110429449e-06, "objective/entropy": -380.69403076171875, "objective/kl": 29.055360794067383, "objective/non_score_reward": -2.90553617477417, "objective/rlhf_reward": -10.243542292205197, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 5.167313575744629, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5859706401824951, "step": 1971, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.998902440071106 }, { "episode": 31568, "epoch": 0.567422799007801, "loss/policy_avg": -0.02175423502922058, "lr": 8.739775051124744e-06, "objective/entropy": -412.723388671875, "objective/kl": 27.598445892333984, "objective/non_score_reward": -2.7598447799682617, "objective/rlhf_reward": -9.615546782215205, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 7.833063125610352, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6198368072509766, "step": 1972, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9997000694274902 }, { "episode": 31584, "epoch": 0.5677103929251897, "loss/policy_avg": 0.2529011368751526, "lr": 8.739135991820041e-06, "objective/entropy": -432.72137451171875, "objective/kl": 39.43279266357422, "objective/non_score_reward": -3.94327974319458, "objective/rlhf_reward": -14.413868868087214, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 3.0191891193389893, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6362906694412231, "step": 1973, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.998401165008545 }, { "episode": 31600, "epoch": 0.5679979868425783, "loss/policy_avg": 0.5850225687026978, "lr": 8.738496932515338e-06, "objective/entropy": -376.6072692871094, "objective/kl": 43.75684356689453, "objective/non_score_reward": -4.3756842613220215, "objective/rlhf_reward": -14.579018031002256, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 2.1123931407928467, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6705749034881592, "step": 1974, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 17, "val/ratio": 2.0005569458007812 }, { "episode": 31616, "epoch": 0.5682855807599669, "loss/policy_avg": -0.07618226110935211, "lr": 8.737857873210635e-06, "objective/entropy": -405.18133544921875, "objective/kl": 40.11215591430664, "objective/non_score_reward": -4.011215686798096, "objective/rlhf_reward": -14.529090487750704, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 1.5373365879058838, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6181695461273193, "step": 1975, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.000474452972412 }, { "episode": 31632, "epoch": 0.5685731746773556, "loss/policy_avg": 0.9590408802032471, "lr": 8.737218813905932e-06, "objective/entropy": -231.49810791015625, "objective/kl": 30.701332092285156, "objective/non_score_reward": -3.0701332092285156, "objective/rlhf_reward": -10.82993445834671, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 24.826133728027344, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5576213598251343, "step": 1976, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.998095989227295 }, { "episode": 31648, "epoch": 0.5688607685947442, "loss/policy_avg": 0.45689743757247925, "lr": 8.736579754601227e-06, "objective/entropy": -394.5408935546875, "objective/kl": 22.510467529296875, "objective/non_score_reward": -2.251046895980835, "objective/rlhf_reward": -7.523234727795481, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 5.653722286224365, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.8274475932121277, "step": 1977, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9999864101409912 }, { "episode": 31664, "epoch": 0.5691483625121329, "loss/policy_avg": 0.23244763910770416, "lr": 8.735940695296524e-06, "objective/entropy": -433.43060302734375, "objective/kl": 33.31128692626953, "objective/non_score_reward": -3.3311288356781006, "objective/rlhf_reward": -11.90068324347314, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 5.945277214050293, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6741795539855957, "step": 1978, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9989917278289795 }, { "episode": 31680, "epoch": 0.5694359564295215, "loss/policy_avg": 0.28232908248901367, "lr": 8.73530163599182e-06, "objective/entropy": -392.271728515625, "objective/kl": 23.622833251953125, "objective/non_score_reward": -2.36228346824646, "objective/rlhf_reward": -7.787274365842926, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 7.521617412567139, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6440614461898804, "step": 1979, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9997217655181885 }, { "episode": 31696, "epoch": 0.5697235503469101, "loss/policy_avg": 0.04843209683895111, "lr": 8.734662576687118e-06, "objective/entropy": -386.32110595703125, "objective/kl": 37.637718200683594, "objective/non_score_reward": -3.7637720108032227, "objective/rlhf_reward": -13.53931626060837, "objective/scores": 0.37894294565112985, "policy/approxkl_avg": 5.142024040222168, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6033267974853516, "step": 1980, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.999253749847412 }, { "episode": 31712, "epoch": 0.5700111442642988, "loss/policy_avg": -0.06089508533477783, "lr": 8.734023517382414e-06, "objective/entropy": -418.746337890625, "objective/kl": 38.00889587402344, "objective/non_score_reward": -3.800889253616333, "objective/rlhf_reward": -13.861922076254515, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 8.117048263549805, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.667477011680603, "step": 1981, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 2.003164291381836 }, { "episode": 31728, "epoch": 0.5702987381816874, "loss/policy_avg": 0.5511655807495117, "lr": 8.733384458077711e-06, "objective/entropy": -391.0799560546875, "objective/kl": 38.276466369628906, "objective/non_score_reward": -3.8276467323303223, "objective/rlhf_reward": -13.951337063048761, "objective/scores": 0.33981246656813147, "policy/approxkl_avg": 0.7365893125534058, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5664993524551392, "step": 1982, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 2.0020337104797363 }, { "episode": 31744, "epoch": 0.570586332099076, "loss/policy_avg": 1.2590185403823853, "lr": 8.732745398773006e-06, "objective/entropy": -360.53515625, "objective/kl": 49.39750671386719, "objective/non_score_reward": -4.939750671386719, "objective/rlhf_reward": -18.38040051707397, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 12.897879600524902, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6766175627708435, "step": 1983, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 15, "val/ratio": 2.0010247230529785 }, { "episode": 31760, "epoch": 0.5708739260164648, "loss/policy_avg": 0.12991289794445038, "lr": 8.732106339468303e-06, "objective/entropy": -413.5390930175781, "objective/kl": 32.60546112060547, "objective/non_score_reward": -3.2605462074279785, "objective/rlhf_reward": -11.716672453910036, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 10.268634796142578, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.735994815826416, "step": 1984, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 14, "val/ratio": 2.002065896987915 }, { "episode": 31776, "epoch": 0.5711615199338534, "loss/policy_avg": 0.24839550256729126, "lr": 8.7314672801636e-06, "objective/entropy": -415.1396484375, "objective/kl": 40.88711929321289, "objective/non_score_reward": -4.088712215423584, "objective/rlhf_reward": -14.931017000873652, "objective/scores": 0.35595802480981553, "policy/approxkl_avg": 11.5997953414917, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5925434231758118, "step": 1985, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.998626708984375 }, { "episode": 31792, "epoch": 0.5714491138512421, "loss/policy_avg": 0.36862099170684814, "lr": 8.730828220858897e-06, "objective/entropy": -387.34735107421875, "objective/kl": 41.57035827636719, "objective/non_score_reward": -4.157035827636719, "objective/rlhf_reward": -14.89481021563212, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 21.14730453491211, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7580534219741821, "step": 1986, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9974285364151 }, { "episode": 31808, "epoch": 0.5717367077686307, "loss/policy_avg": 0.9366190433502197, "lr": 8.730189161554194e-06, "objective/entropy": -428.2396545410156, "objective/kl": 38.03770065307617, "objective/non_score_reward": -3.803770065307617, "objective/rlhf_reward": -13.734127166684033, "objective/scores": 0.3702381544273198, "policy/approxkl_avg": 14.858156204223633, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.61741042137146, "step": 1987, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9998137950897217 }, { "episode": 31824, "epoch": 0.5720243016860194, "loss/policy_avg": -0.019822321832180023, "lr": 8.72955010224949e-06, "objective/entropy": -408.2901611328125, "objective/kl": 32.86903381347656, "objective/non_score_reward": -3.2869033813476562, "objective/rlhf_reward": -11.769011595336298, "objective/scores": 0.34465054211822604, "policy/approxkl_avg": 3.686215400695801, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6954542398452759, "step": 1988, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.999575138092041 }, { "episode": 31840, "epoch": 0.572311895603408, "loss/policy_avg": 0.5835769176483154, "lr": 8.728911042944786e-06, "objective/entropy": -375.736572265625, "objective/kl": 45.67652130126953, "objective/non_score_reward": -4.567651748657227, "objective/rlhf_reward": -16.82000956973587, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 13.066511154174805, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.606543242931366, "step": 1989, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9994808435440063 }, { "episode": 31856, "epoch": 0.5725994895207966, "loss/policy_avg": 0.12761659920215607, "lr": 8.728271983640081e-06, "objective/entropy": -395.0611267089844, "objective/kl": 39.675968170166016, "objective/non_score_reward": -3.967597007751465, "objective/rlhf_reward": -14.470387911796571, "objective/scores": 0.35, "policy/approxkl_avg": 30.816078186035156, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5694636106491089, "step": 1990, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9987260103225708 }, { "episode": 31872, "epoch": 0.5728870834381853, "loss/policy_avg": 9.85373306274414, "lr": 8.727632924335378e-06, "objective/entropy": -434.45928955078125, "objective/kl": 35.62004089355469, "objective/non_score_reward": -3.562004327774048, "objective/rlhf_reward": -9.848017311096191, "objective/scores": 1.1, "policy/approxkl_avg": 2.7024757862091064, "policy/clipfrac_avg": 0.25, "policy/entropy_avg": 0.6357365846633911, "step": 1991, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.0227861404418945 }, { "episode": 31888, "epoch": 0.5731746773555739, "loss/policy_avg": 0.8553334474563599, "lr": 8.726993865030675e-06, "objective/entropy": -422.55926513671875, "objective/kl": 29.18614959716797, "objective/non_score_reward": -2.9186151027679443, "objective/rlhf_reward": -10.118200628963068, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 11.778656005859375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6409720182418823, "step": 1992, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.999786376953125 }, { "episode": 31904, "epoch": 0.5734622712729626, "loss/policy_avg": 0.37166500091552734, "lr": 8.726354805725972e-06, "objective/entropy": -417.41204833984375, "objective/kl": 38.29414749145508, "objective/non_score_reward": -3.8294146060943604, "objective/rlhf_reward": -13.65579844039737, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 5.944975852966309, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7124284505844116, "step": 1993, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9977558851242065 }, { "episode": 31920, "epoch": 0.5737498651903512, "loss/policy_avg": -0.17341525852680206, "lr": 8.725715746421269e-06, "objective/entropy": -429.22857666015625, "objective/kl": 25.688812255859375, "objective/non_score_reward": -2.5688815116882324, "objective/rlhf_reward": -8.824927906604156, "objective/scores": 0.36264953503719355, "policy/approxkl_avg": 0.703777551651001, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.6408061385154724, "step": 1994, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 2.0024032592773438 }, { "episode": 31936, "epoch": 0.5740374591077398, "loss/policy_avg": 0.4482753872871399, "lr": 8.725076687116566e-06, "objective/entropy": -404.13861083984375, "objective/kl": 42.459266662597656, "objective/non_score_reward": -4.245926380157471, "objective/rlhf_reward": -15.250372425715128, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 7.863109588623047, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6432361602783203, "step": 1995, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9992403984069824 }, { "episode": 31952, "epoch": 0.5743250530251285, "loss/policy_avg": 0.656990647315979, "lr": 8.72443762781186e-06, "objective/entropy": -359.4864501953125, "objective/kl": 22.79832649230957, "objective/non_score_reward": -2.279832601547241, "objective/rlhf_reward": -6.9966242931046825, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 1.5574357509613037, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.60481858253479, "step": 1996, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.998363971710205 }, { "episode": 31968, "epoch": 0.5746126469425171, "loss/policy_avg": 0.28319376707077026, "lr": 8.723798568507158e-06, "objective/entropy": -424.09393310546875, "objective/kl": 43.706398010253906, "objective/non_score_reward": -4.370640277862549, "objective/rlhf_reward": -15.878440890375693, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 26.99090576171875, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7011884450912476, "step": 1997, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.999915599822998 }, { "episode": 31984, "epoch": 0.5749002408599058, "loss/policy_avg": 0.15383091568946838, "lr": 8.723159509202455e-06, "objective/entropy": -355.4871826171875, "objective/kl": 36.945648193359375, "objective/non_score_reward": -3.6945648193359375, "objective/rlhf_reward": -13.452746901541872, "objective/scores": 0.3313782131597591, "policy/approxkl_avg": 7.352242469787598, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7068512439727783, "step": 1998, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 2.0004754066467285 }, { "episode": 32000, "epoch": 0.5751878347772945, "loss/policy_avg": 0.29721391201019287, "lr": 8.722520449897751e-06, "objective/entropy": -368.88641357421875, "objective/kl": 21.11037826538086, "objective/non_score_reward": -2.1110377311706543, "objective/rlhf_reward": -7.102515271216063, "objective/scores": 0.33540891336663825, "policy/approxkl_avg": 1.1353716850280762, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6281009316444397, "step": 1999, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 14, "val/ratio": 1.9985047578811646 } ], "logging_steps": 500, "max_steps": 7824, "num_input_tokens_seen": 0, "num_train_epochs": 9.0, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": true, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0, "train_batch_size": null, "trial_name": null, "trial_params": null }