diff --git "a/checkpoint-214/trainer_state.json" "b/checkpoint-214/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-214/trainer_state.json" @@ -0,0 +1,4315 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "episode": 109568, + "epoch": 0.4693625771076079, + "eval_steps": 500, + "global_step": 214, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "episode": 512, + "epoch": 0.0021932830705962986, + "eps": 6, + "loss/policy_avg": 0.15234875679016113, + "lr": 3e-06, + "objective/entropy": -45.407432556152344, + "objective/kl": 0.3935524821281433, + "objective/non_score_reward": -0.001967762364074588, + "objective/rlhf_reward": 4.677086353302002, + "objective/scores": 4.679054260253906, + "policy/approxkl_avg": 0.11690396815538406, + "policy/clipfrac_avg": 0.34375, + "policy/entropy_avg": 0.8878304958343506, + "step": 0, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2729, + "val/ratio": 1.000101089477539, + "val/ratio_var": 2.341874278499745e-06 + }, + { + "episode": 1024, + "epoch": 0.004386566141192597, + "eps": 6, + "loss/policy_avg": 0.10656304657459259, + "lr": 2.9882812500000002e-06, + "objective/entropy": -49.59076690673828, + "objective/kl": 0.6760815978050232, + "objective/non_score_reward": -0.0033804080449044704, + "objective/rlhf_reward": 4.551654815673828, + "objective/scores": 4.55503511428833, + "policy/approxkl_avg": 0.12034796178340912, + "policy/clipfrac_avg": 0.32421875, + "policy/entropy_avg": 0.8024331331253052, + "step": 1, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3517, + "val/ratio": 0.9998391270637512, + "val/ratio_var": 2.0723566649394343e-06 + }, + { + "episode": 1536, + "epoch": 0.006579849211788897, + "eps": 6, + "loss/policy_avg": 0.17722828686237335, + "lr": 2.9765625e-06, + "objective/entropy": -51.09684371948242, + "objective/kl": 1.214200735092163, + "objective/non_score_reward": -0.006071004085242748, + "objective/rlhf_reward": 4.974597930908203, + "objective/scores": 4.980669021606445, + "policy/approxkl_avg": 0.12037836015224457, + "policy/clipfrac_avg": 0.318359375, + "policy/entropy_avg": 0.8043380379676819, + "step": 2, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3252, + "val/ratio": 1.0001596212387085, + "val/ratio_var": 2.1233340703474823e-06 + }, + { + "episode": 2048, + "epoch": 0.008773132282385195, + "eps": 6, + "loss/policy_avg": 0.11922580748796463, + "lr": 2.96484375e-06, + "objective/entropy": -44.18528366088867, + "objective/kl": 1.4338339567184448, + "objective/non_score_reward": -0.007169169839471579, + "objective/rlhf_reward": 4.915553569793701, + "objective/scores": 4.922722339630127, + "policy/approxkl_avg": 0.16550302505493164, + "policy/clipfrac_avg": 0.375, + "policy/entropy_avg": 0.7921671271324158, + "step": 3, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2435, + "val/ratio": 0.9996773600578308, + "val/ratio_var": 2.4303119516844163e-06 + }, + { + "episode": 2560, + "epoch": 0.010966415352981495, + "eps": 6, + "loss/policy_avg": 0.15014484524726868, + "lr": 2.953125e-06, + "objective/entropy": -42.067405700683594, + "objective/kl": 2.0778450965881348, + "objective/non_score_reward": -0.01038922369480133, + "objective/rlhf_reward": 5.2342209815979, + "objective/scores": 5.244609832763672, + "policy/approxkl_avg": 0.1276472508907318, + "policy/clipfrac_avg": 0.31640625, + "policy/entropy_avg": 0.7496817111968994, + "step": 4, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1877, + "val/ratio": 1.0003182888031006, + "val/ratio_var": 2.2540482405020157e-06 + }, + { + "episode": 3072, + "epoch": 0.013159698423577794, + "eps": 6, + "loss/policy_avg": 0.1916506588459015, + "lr": 2.94140625e-06, + "objective/entropy": -39.82306671142578, + "objective/kl": 3.079103708267212, + "objective/non_score_reward": -0.015395518392324448, + "objective/rlhf_reward": 5.611381530761719, + "objective/scores": 5.626776695251465, + "policy/approxkl_avg": 0.13991527259349823, + "policy/clipfrac_avg": 0.33203125, + "policy/entropy_avg": 0.7127301692962646, + "step": 5, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1565, + "val/ratio": 0.9999192357063293, + "val/ratio_var": 1.5017312762211077e-06 + }, + { + "episode": 3584, + "epoch": 0.015352981494174092, + "eps": 6, + "loss/policy_avg": 0.1968970149755478, + "lr": 2.9296875e-06, + "objective/entropy": -32.000404357910156, + "objective/kl": 4.284768581390381, + "objective/non_score_reward": -0.021423842757940292, + "objective/rlhf_reward": 5.7136030197143555, + "objective/scores": 5.7350263595581055, + "policy/approxkl_avg": 0.1647772341966629, + "policy/clipfrac_avg": 0.3515625, + "policy/entropy_avg": 0.6934345364570618, + "step": 6, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1364, + "val/ratio": 0.9998640418052673, + "val/ratio_var": 2.735638872763957e-06 + }, + { + "episode": 4096, + "epoch": 0.01754626456477039, + "eps": 6, + "loss/policy_avg": 0.20194123685359955, + "lr": 2.91796875e-06, + "objective/entropy": -35.15373992919922, + "objective/kl": 4.795171737670898, + "objective/non_score_reward": -0.023975860327482224, + "objective/rlhf_reward": 6.039769649505615, + "objective/scores": 6.063745498657227, + "policy/approxkl_avg": 0.19040407240390778, + "policy/clipfrac_avg": 0.365234375, + "policy/entropy_avg": 0.6659716367721558, + "step": 7, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1595, + "val/ratio": 0.9995955228805542, + "val/ratio_var": 2.328864411538234e-06 + }, + { + "episode": 4608, + "epoch": 0.01973954763536669, + "eps": 6, + "loss/policy_avg": 0.2287895679473877, + "lr": 2.90625e-06, + "objective/entropy": -33.49115753173828, + "objective/kl": 6.028232097625732, + "objective/non_score_reward": -0.030141159892082214, + "objective/rlhf_reward": 6.286189556121826, + "objective/scores": 6.316330909729004, + "policy/approxkl_avg": 0.16508515179157257, + "policy/clipfrac_avg": 0.35546875, + "policy/entropy_avg": 0.6881623268127441, + "step": 8, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1875, + "val/ratio": 0.9999189376831055, + "val/ratio_var": 2.560210077717784e-06 + }, + { + "episode": 5120, + "epoch": 0.02193283070596299, + "eps": 6, + "loss/policy_avg": 0.23305176198482513, + "lr": 2.89453125e-06, + "objective/entropy": -31.895954132080078, + "objective/kl": 7.27902889251709, + "objective/non_score_reward": -0.03639514371752739, + "objective/rlhf_reward": 6.584256172180176, + "objective/scores": 6.6206512451171875, + "policy/approxkl_avg": 0.15825235843658447, + "policy/clipfrac_avg": 0.333984375, + "policy/entropy_avg": 0.6986380219459534, + "step": 9, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2233, + "val/ratio": 1.000351905822754, + "val/ratio_var": 2.54610608863004e-06 + }, + { + "episode": 5632, + "epoch": 0.024126113776559287, + "eps": 6, + "loss/policy_avg": 0.2743307948112488, + "lr": 2.8828125e-06, + "objective/entropy": -30.13780975341797, + "objective/kl": 8.618773460388184, + "objective/non_score_reward": -0.04309386759996414, + "objective/rlhf_reward": 6.671762466430664, + "objective/scores": 6.714856147766113, + "policy/approxkl_avg": 0.2310631275177002, + "policy/clipfrac_avg": 0.376953125, + "policy/entropy_avg": 0.6997334957122803, + "step": 10, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2288, + "val/ratio": 0.9999397397041321, + "val/ratio_var": 2.7119385777041316e-06 + }, + { + "episode": 6144, + "epoch": 0.026319396847155587, + "eps": 6, + "loss/policy_avg": 0.47530221939086914, + "lr": 2.87109375e-06, + "objective/entropy": -25.868932723999023, + "objective/kl": 10.715995788574219, + "objective/non_score_reward": -0.05357997864484787, + "objective/rlhf_reward": 6.660526275634766, + "objective/scores": 6.714106559753418, + "policy/approxkl_avg": 0.23832854628562927, + "policy/clipfrac_avg": 0.33984375, + "policy/entropy_avg": 0.6960214376449585, + "step": 11, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2465, + "val/ratio": 1.0001269578933716, + "val/ratio_var": 4.06193794333376e-06 + }, + { + "episode": 6656, + "epoch": 0.028512679917751884, + "eps": 6, + "loss/policy_avg": 0.2478591650724411, + "lr": 2.859375e-06, + "objective/entropy": -28.481605529785156, + "objective/kl": 11.356575012207031, + "objective/non_score_reward": -0.05678287148475647, + "objective/rlhf_reward": 7.32498025894165, + "objective/scores": 7.381762981414795, + "policy/approxkl_avg": 0.23808607459068298, + "policy/clipfrac_avg": 0.345703125, + "policy/entropy_avg": 0.6703023910522461, + "step": 12, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2564, + "val/ratio": 0.999792754650116, + "val/ratio_var": 2.613600599943311e-06 + }, + { + "episode": 7168, + "epoch": 0.030705962988348184, + "eps": 6, + "loss/policy_avg": 0.18863295018672943, + "lr": 2.84765625e-06, + "objective/entropy": -31.87090301513672, + "objective/kl": 12.704984664916992, + "objective/non_score_reward": -0.06352491676807404, + "objective/rlhf_reward": 7.6809868812561035, + "objective/scores": 7.744511604309082, + "policy/approxkl_avg": 0.20181894302368164, + "policy/clipfrac_avg": 0.3828125, + "policy/entropy_avg": 0.6681352853775024, + "step": 13, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2607, + "val/ratio": 1.0001722574234009, + "val/ratio_var": 3.1368097097583814e-06 + }, + { + "episode": 7680, + "epoch": 0.03289924605894448, + "eps": 6, + "loss/policy_avg": 0.23149724304676056, + "lr": 2.8359375e-06, + "objective/entropy": -31.151302337646484, + "objective/kl": 14.230264663696289, + "objective/non_score_reward": -0.07115132361650467, + "objective/rlhf_reward": 7.844120025634766, + "objective/scores": 7.915271282196045, + "policy/approxkl_avg": 0.23936033248901367, + "policy/clipfrac_avg": 0.40625, + "policy/entropy_avg": 0.6505059003829956, + "step": 14, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2523, + "val/ratio": 0.999779224395752, + "val/ratio_var": 3.855011527775787e-06 + }, + { + "episode": 8192, + "epoch": 0.03509252912954078, + "eps": 6, + "loss/policy_avg": 0.25306057929992676, + "lr": 2.82421875e-06, + "objective/entropy": -33.95362091064453, + "objective/kl": 15.211322784423828, + "objective/non_score_reward": -0.07605661451816559, + "objective/rlhf_reward": 8.07334041595459, + "objective/scores": 8.149396896362305, + "policy/approxkl_avg": 0.2219444364309311, + "policy/clipfrac_avg": 0.376953125, + "policy/entropy_avg": 0.6534073352813721, + "step": 15, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2392, + "val/ratio": 1.000091791152954, + "val/ratio_var": 3.7099030123499688e-06 + }, + { + "episode": 8704, + "epoch": 0.03728581220013708, + "eps": 6, + "loss/policy_avg": 0.3671952486038208, + "lr": 2.8125e-06, + "objective/entropy": -36.56987762451172, + "objective/kl": 15.896276473999023, + "objective/non_score_reward": -0.07948137074708939, + "objective/rlhf_reward": 8.115487098693848, + "objective/scores": 8.194968223571777, + "policy/approxkl_avg": 0.2001374065876007, + "policy/clipfrac_avg": 0.376953125, + "policy/entropy_avg": 0.5611248016357422, + "step": 16, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2099, + "val/ratio": 1.0001428127288818, + "val/ratio_var": 2.9763129987259163e-06 + }, + { + "episode": 9216, + "epoch": 0.03947909527073338, + "eps": 6, + "loss/policy_avg": 0.2377510964870453, + "lr": 2.80078125e-06, + "objective/entropy": -38.7183837890625, + "objective/kl": 16.948450088500977, + "objective/non_score_reward": -0.08474224805831909, + "objective/rlhf_reward": 8.287287712097168, + "objective/scores": 8.372029304504395, + "policy/approxkl_avg": 0.19774162769317627, + "policy/clipfrac_avg": 0.33984375, + "policy/entropy_avg": 0.5588769912719727, + "step": 17, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2173, + "val/ratio": 0.9999226331710815, + "val/ratio_var": 2.539955858082976e-06 + }, + { + "episode": 9728, + "epoch": 0.041672378341329676, + "eps": 6, + "loss/policy_avg": 0.30208033323287964, + "lr": 2.7890625e-06, + "objective/entropy": -37.54444122314453, + "objective/kl": 17.52410316467285, + "objective/non_score_reward": -0.08762051165103912, + "objective/rlhf_reward": 8.367655754089355, + "objective/scores": 8.455276489257812, + "policy/approxkl_avg": 0.20374038815498352, + "policy/clipfrac_avg": 0.33984375, + "policy/entropy_avg": 0.5387827157974243, + "step": 18, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2168, + "val/ratio": 0.99991774559021, + "val/ratio_var": 3.0660403353977017e-06 + }, + { + "episode": 10240, + "epoch": 0.04386566141192598, + "eps": 6, + "loss/policy_avg": 0.22399874031543732, + "lr": 2.77734375e-06, + "objective/entropy": -39.96161651611328, + "objective/kl": 17.749210357666016, + "objective/non_score_reward": -0.08874605596065521, + "objective/rlhf_reward": 8.257219314575195, + "objective/scores": 8.345966339111328, + "policy/approxkl_avg": 0.2131994068622589, + "policy/clipfrac_avg": 0.359375, + "policy/entropy_avg": 0.5233398079872131, + "step": 19, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2201, + "val/ratio": 1.0001959800720215, + "val/ratio_var": 3.099843979725847e-06 + }, + { + "episode": 10752, + "epoch": 0.04605894448252228, + "eps": 6, + "loss/policy_avg": 0.16000297665596008, + "lr": 2.765625e-06, + "objective/entropy": -40.21980285644531, + "objective/kl": 18.693737030029297, + "objective/non_score_reward": -0.09346868097782135, + "objective/rlhf_reward": 8.49026107788086, + "objective/scores": 8.58372974395752, + "policy/approxkl_avg": 0.2675846219062805, + "policy/clipfrac_avg": 0.38671875, + "policy/entropy_avg": 0.524081826210022, + "step": 20, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2342, + "val/ratio": 0.9995419979095459, + "val/ratio_var": 3.2623988772684243e-06 + }, + { + "episode": 11264, + "epoch": 0.048252227553118573, + "eps": 6, + "loss/policy_avg": 0.23491168022155762, + "lr": 2.75390625e-06, + "objective/entropy": -38.962947845458984, + "objective/kl": 19.98727035522461, + "objective/non_score_reward": -0.099936343729496, + "objective/rlhf_reward": 8.68085765838623, + "objective/scores": 8.780793190002441, + "policy/approxkl_avg": 0.19925493001937866, + "policy/clipfrac_avg": 0.3359375, + "policy/entropy_avg": 0.5088595151901245, + "step": 21, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2385, + "val/ratio": 0.9999128580093384, + "val/ratio_var": 3.195402769051725e-06 + }, + { + "episode": 11776, + "epoch": 0.05044551062371487, + "eps": 6, + "loss/policy_avg": 0.20272764563560486, + "lr": 2.7421875e-06, + "objective/entropy": -39.06739807128906, + "objective/kl": 20.73778533935547, + "objective/non_score_reward": -0.10368892550468445, + "objective/rlhf_reward": 8.66939640045166, + "objective/scores": 8.77308464050293, + "policy/approxkl_avg": 0.24808868765830994, + "policy/clipfrac_avg": 0.3671875, + "policy/entropy_avg": 0.5206277370452881, + "step": 22, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2328, + "val/ratio": 0.9997999668121338, + "val/ratio_var": 2.6692378014558926e-06 + }, + { + "episode": 12288, + "epoch": 0.052638793694311174, + "eps": 6, + "loss/policy_avg": 0.1657373309135437, + "lr": 2.73046875e-06, + "objective/entropy": -41.76697540283203, + "objective/kl": 21.265926361083984, + "objective/non_score_reward": -0.10632962733507156, + "objective/rlhf_reward": 8.899429321289062, + "objective/scores": 9.005760192871094, + "policy/approxkl_avg": 0.21000100672245026, + "policy/clipfrac_avg": 0.330078125, + "policy/entropy_avg": 0.4891219437122345, + "step": 23, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2460, + "val/ratio": 1.000035285949707, + "val/ratio_var": 3.269958597229561e-06 + }, + { + "episode": 12800, + "epoch": 0.05483207676490747, + "eps": 6, + "loss/policy_avg": 0.17451098561286926, + "lr": 2.71875e-06, + "objective/entropy": -40.41400146484375, + "objective/kl": 22.69107437133789, + "objective/non_score_reward": -0.11345535516738892, + "objective/rlhf_reward": 8.97903060913086, + "objective/scores": 9.092485427856445, + "policy/approxkl_avg": 0.2612704336643219, + "policy/clipfrac_avg": 0.37109375, + "policy/entropy_avg": 0.4792313873767853, + "step": 24, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2361, + "val/ratio": 0.9997962713241577, + "val/ratio_var": 4.033691766380798e-06 + }, + { + "episode": 13312, + "epoch": 0.05702535983550377, + "eps": 6, + "loss/policy_avg": 0.3060297966003418, + "lr": 2.70703125e-06, + "objective/entropy": -39.72850036621094, + "objective/kl": 24.020402908325195, + "objective/non_score_reward": -0.12010201811790466, + "objective/rlhf_reward": 8.757214546203613, + "objective/scores": 8.877317428588867, + "policy/approxkl_avg": 0.20275026559829712, + "policy/clipfrac_avg": 0.333984375, + "policy/entropy_avg": 0.47031739354133606, + "step": 25, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2365, + "val/ratio": 1.0004143714904785, + "val/ratio_var": 4.571053977997508e-06 + }, + { + "episode": 13824, + "epoch": 0.05921864290610007, + "eps": 6, + "loss/policy_avg": 0.2459714114665985, + "lr": 2.6953125e-06, + "objective/entropy": -42.67210388183594, + "objective/kl": 24.816062927246094, + "objective/non_score_reward": -0.12408032268285751, + "objective/rlhf_reward": 8.767488479614258, + "objective/scores": 8.891569137573242, + "policy/approxkl_avg": 0.16891685128211975, + "policy/clipfrac_avg": 0.314453125, + "policy/entropy_avg": 0.4268378019332886, + "step": 26, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2338, + "val/ratio": 1.0001232624053955, + "val/ratio_var": 2.055706318060402e-06 + }, + { + "episode": 14336, + "epoch": 0.06141192597669637, + "eps": 6, + "loss/policy_avg": 0.2477143555879593, + "lr": 2.68359375e-06, + "objective/entropy": -40.98384094238281, + "objective/kl": 26.158355712890625, + "objective/non_score_reward": -0.1307917833328247, + "objective/rlhf_reward": 8.869270324707031, + "objective/scores": 9.000061988830566, + "policy/approxkl_avg": 0.21648374199867249, + "policy/clipfrac_avg": 0.35546875, + "policy/entropy_avg": 0.43942803144454956, + "step": 27, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2325, + "val/ratio": 1.00002920627594, + "val/ratio_var": 3.6563492358254734e-06 + }, + { + "episode": 14848, + "epoch": 0.06360520904729267, + "eps": 6, + "loss/policy_avg": 0.18666958808898926, + "lr": 2.671875e-06, + "objective/entropy": -39.86817169189453, + "objective/kl": 26.283226013183594, + "objective/non_score_reward": -0.13141612708568573, + "objective/rlhf_reward": 9.001875877380371, + "objective/scores": 9.133292198181152, + "policy/approxkl_avg": 0.22604086995124817, + "policy/clipfrac_avg": 0.37109375, + "policy/entropy_avg": 0.4275825023651123, + "step": 28, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2388, + "val/ratio": 0.999720573425293, + "val/ratio_var": 2.5127403660007985e-06 + }, + { + "episode": 15360, + "epoch": 0.06579849211788896, + "eps": 6, + "loss/policy_avg": 0.21501849591732025, + "lr": 2.66015625e-06, + "objective/entropy": -34.56945037841797, + "objective/kl": 28.54375457763672, + "objective/non_score_reward": -0.14271876215934753, + "objective/rlhf_reward": 8.915486335754395, + "objective/scores": 9.058204650878906, + "policy/approxkl_avg": 0.23621705174446106, + "policy/clipfrac_avg": 0.361328125, + "policy/entropy_avg": 0.42039746046066284, + "step": 29, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2389, + "val/ratio": 0.9995044469833374, + "val/ratio_var": 4.271460511517944e-06 + }, + { + "episode": 15872, + "epoch": 0.06799177518848526, + "eps": 6, + "loss/policy_avg": 0.343304842710495, + "lr": 2.6484375e-06, + "objective/entropy": -35.42715072631836, + "objective/kl": 28.722532272338867, + "objective/non_score_reward": -0.14361265301704407, + "objective/rlhf_reward": 9.159289360046387, + "objective/scores": 9.302902221679688, + "policy/approxkl_avg": 0.221934974193573, + "policy/clipfrac_avg": 0.3046875, + "policy/entropy_avg": 0.3945683240890503, + "step": 30, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2236, + "val/ratio": 1.0000040531158447, + "val/ratio_var": 3.5196928820369067e-06 + }, + { + "episode": 16384, + "epoch": 0.07018505825908156, + "eps": 6, + "loss/policy_avg": 0.4800252318382263, + "lr": 2.63671875e-06, + "objective/entropy": -33.33995056152344, + "objective/kl": 29.83435821533203, + "objective/non_score_reward": -0.14917179942131042, + "objective/rlhf_reward": 9.096776962280273, + "objective/scores": 9.245948791503906, + "policy/approxkl_avg": 0.267780601978302, + "policy/clipfrac_avg": 0.337890625, + "policy/entropy_avg": 0.401678204536438, + "step": 31, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2179, + "val/ratio": 1.0002527236938477, + "val/ratio_var": 3.2502473459317116e-06 + }, + { + "episode": 16896, + "epoch": 0.07237834132967787, + "eps": 6, + "loss/policy_avg": 0.2809556722640991, + "lr": 2.6250000000000003e-06, + "objective/entropy": -34.06169509887695, + "objective/kl": 31.27130889892578, + "objective/non_score_reward": -0.156356543302536, + "objective/rlhf_reward": 9.190299034118652, + "objective/scores": 9.346654891967773, + "policy/approxkl_avg": 0.2590065896511078, + "policy/clipfrac_avg": 0.333984375, + "policy/entropy_avg": 0.39545533061027527, + "step": 32, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2267, + "val/ratio": 1.0000057220458984, + "val/ratio_var": 3.7364695799624315e-06 + }, + { + "episode": 17408, + "epoch": 0.07457162440027416, + "eps": 6, + "loss/policy_avg": 0.22726668417453766, + "lr": 2.61328125e-06, + "objective/entropy": -29.238605499267578, + "objective/kl": 32.63374710083008, + "objective/non_score_reward": -0.16316872835159302, + "objective/rlhf_reward": 9.312760353088379, + "objective/scores": 9.475929260253906, + "policy/approxkl_avg": 0.25571638345718384, + "policy/clipfrac_avg": 0.328125, + "policy/entropy_avg": 0.38992634415626526, + "step": 33, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2153, + "val/ratio": 0.9997001886367798, + "val/ratio_var": 2.696072442631703e-06 + }, + { + "episode": 17920, + "epoch": 0.07676490747087046, + "eps": 6, + "loss/policy_avg": 0.31760329008102417, + "lr": 2.6015625e-06, + "objective/entropy": -30.36035919189453, + "objective/kl": 33.105316162109375, + "objective/non_score_reward": -0.16552656888961792, + "objective/rlhf_reward": 9.344242095947266, + "objective/scores": 9.509769439697266, + "policy/approxkl_avg": 0.26468268036842346, + "policy/clipfrac_avg": 0.31640625, + "policy/entropy_avg": 0.3962373435497284, + "step": 34, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2063, + "val/ratio": 0.9997377395629883, + "val/ratio_var": 3.3684978006931487e-06 + }, + { + "episode": 18432, + "epoch": 0.07895819054146676, + "eps": 6, + "loss/policy_avg": 0.29205620288848877, + "lr": 2.5898437500000003e-06, + "objective/entropy": -31.297311782836914, + "objective/kl": 34.06657409667969, + "objective/non_score_reward": -0.1703328639268875, + "objective/rlhf_reward": 9.36227035522461, + "objective/scores": 9.532604217529297, + "policy/approxkl_avg": 0.23790021240711212, + "policy/clipfrac_avg": 0.359375, + "policy/entropy_avg": 0.39074409008026123, + "step": 35, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2030, + "val/ratio": 1.0001909732818604, + "val/ratio_var": 4.307842573325615e-06 + }, + { + "episode": 18944, + "epoch": 0.08115147361206305, + "eps": 6, + "loss/policy_avg": 0.27079224586486816, + "lr": 2.578125e-06, + "objective/entropy": -31.059741973876953, + "objective/kl": 34.23755645751953, + "objective/non_score_reward": -0.17118775844573975, + "objective/rlhf_reward": 9.575616836547852, + "objective/scores": 9.746803283691406, + "policy/approxkl_avg": 0.2780650556087494, + "policy/clipfrac_avg": 0.359375, + "policy/entropy_avg": 0.36144694685935974, + "step": 36, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1946, + "val/ratio": 1.0000677108764648, + "val/ratio_var": 5.890160991839366e-06 + }, + { + "episode": 19456, + "epoch": 0.08334475668265935, + "eps": 6, + "loss/policy_avg": 0.35481345653533936, + "lr": 2.56640625e-06, + "objective/entropy": -29.504077911376953, + "objective/kl": 35.572723388671875, + "objective/non_score_reward": -0.17786362767219543, + "objective/rlhf_reward": 9.650212287902832, + "objective/scores": 9.828075408935547, + "policy/approxkl_avg": 0.27989301085472107, + "policy/clipfrac_avg": 0.341796875, + "policy/entropy_avg": 0.36728498339653015, + "step": 37, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1913, + "val/ratio": 1.000013828277588, + "val/ratio_var": 3.834660674328916e-06 + }, + { + "episode": 19968, + "epoch": 0.08553803975325565, + "eps": 6, + "loss/policy_avg": 0.32023417949676514, + "lr": 2.5546875000000003e-06, + "objective/entropy": -32.94467544555664, + "objective/kl": 35.05192565917969, + "objective/non_score_reward": -0.17525961995124817, + "objective/rlhf_reward": 9.715032577514648, + "objective/scores": 9.89029312133789, + "policy/approxkl_avg": 0.28202196955680847, + "policy/clipfrac_avg": 0.32421875, + "policy/entropy_avg": 0.3595014214515686, + "step": 38, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1895, + "val/ratio": 1.0004128217697144, + "val/ratio_var": 3.976154857809888e-06 + }, + { + "episode": 20480, + "epoch": 0.08773132282385196, + "eps": 6, + "loss/policy_avg": 0.3284844160079956, + "lr": 2.54296875e-06, + "objective/entropy": -33.950965881347656, + "objective/kl": 35.06300354003906, + "objective/non_score_reward": -0.1753150224685669, + "objective/rlhf_reward": 9.614059448242188, + "objective/scores": 9.789375305175781, + "policy/approxkl_avg": 0.2808968126773834, + "policy/clipfrac_avg": 0.3359375, + "policy/entropy_avg": 0.3299378454685211, + "step": 39, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1822, + "val/ratio": 1.0001529455184937, + "val/ratio_var": 3.939061116398079e-06 + }, + { + "episode": 20992, + "epoch": 0.08992460589444826, + "eps": 6, + "loss/policy_avg": 0.25570446252822876, + "lr": 2.53125e-06, + "objective/entropy": -33.10383605957031, + "objective/kl": 37.40079116821289, + "objective/non_score_reward": -0.1870039403438568, + "objective/rlhf_reward": 9.649069786071777, + "objective/scores": 9.836073875427246, + "policy/approxkl_avg": 0.28119906783103943, + "policy/clipfrac_avg": 0.349609375, + "policy/entropy_avg": 0.33145490288734436, + "step": 40, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1757, + "val/ratio": 0.999976396560669, + "val/ratio_var": 3.935067525162594e-06 + }, + { + "episode": 21504, + "epoch": 0.09211788896504455, + "eps": 6, + "loss/policy_avg": 0.3803982436656952, + "lr": 2.5195312500000003e-06, + "objective/entropy": -35.517391204833984, + "objective/kl": 35.55995178222656, + "objective/non_score_reward": -0.1777997612953186, + "objective/rlhf_reward": 9.623468399047852, + "objective/scores": 9.801267623901367, + "policy/approxkl_avg": 0.28167036175727844, + "policy/clipfrac_avg": 0.3828125, + "policy/entropy_avg": 0.33138006925582886, + "step": 41, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1765, + "val/ratio": 1.000349760055542, + "val/ratio_var": 4.55302324553486e-06 + }, + { + "episode": 22016, + "epoch": 0.09431117203564085, + "eps": 6, + "loss/policy_avg": 0.2066161185503006, + "lr": 2.5078125e-06, + "objective/entropy": -34.937923431396484, + "objective/kl": 34.828224182128906, + "objective/non_score_reward": -0.17414110898971558, + "objective/rlhf_reward": 10.007875442504883, + "objective/scores": 10.18201732635498, + "policy/approxkl_avg": 0.25226256251335144, + "policy/clipfrac_avg": 0.349609375, + "policy/entropy_avg": 0.31548449397087097, + "step": 42, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1692, + "val/ratio": 1.0003182888031006, + "val/ratio_var": 3.0859027901897207e-06 + }, + { + "episode": 22528, + "epoch": 0.09650445510623715, + "eps": 6, + "loss/policy_avg": 0.21718214452266693, + "lr": 2.49609375e-06, + "objective/entropy": -36.30521774291992, + "objective/kl": 35.64817810058594, + "objective/non_score_reward": -0.17824086546897888, + "objective/rlhf_reward": 9.524388313293457, + "objective/scores": 9.702629089355469, + "policy/approxkl_avg": 0.3034236431121826, + "policy/clipfrac_avg": 0.359375, + "policy/entropy_avg": 0.333159863948822, + "step": 43, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1736, + "val/ratio": 0.9996454119682312, + "val/ratio_var": 3.5439479688648134e-06 + }, + { + "episode": 23040, + "epoch": 0.09869773817683344, + "eps": 6, + "loss/policy_avg": 0.20844724774360657, + "lr": 2.4843750000000002e-06, + "objective/entropy": -37.294647216796875, + "objective/kl": 34.449188232421875, + "objective/non_score_reward": -0.17224593460559845, + "objective/rlhf_reward": 9.562028884887695, + "objective/scores": 9.734274864196777, + "policy/approxkl_avg": 0.2691035270690918, + "policy/clipfrac_avg": 0.369140625, + "policy/entropy_avg": 0.3254753351211548, + "step": 44, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1712, + "val/ratio": 0.9999078512191772, + "val/ratio_var": 3.873953573929612e-06 + }, + { + "episode": 23552, + "epoch": 0.10089102124742974, + "eps": 6, + "loss/policy_avg": 0.26617997884750366, + "lr": 2.47265625e-06, + "objective/entropy": -38.45973205566406, + "objective/kl": 33.36463165283203, + "objective/non_score_reward": -0.166823148727417, + "objective/rlhf_reward": 9.529053688049316, + "objective/scores": 9.695877075195312, + "policy/approxkl_avg": 0.2848384976387024, + "policy/clipfrac_avg": 0.32421875, + "policy/entropy_avg": 0.33362650871276855, + "step": 45, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1776, + "val/ratio": 0.9998953938484192, + "val/ratio_var": 4.734669346362352e-06 + }, + { + "episode": 24064, + "epoch": 0.10308430431802605, + "eps": 6, + "loss/policy_avg": 0.14936941862106323, + "lr": 2.4609375e-06, + "objective/entropy": -41.61138153076172, + "objective/kl": 32.80962371826172, + "objective/non_score_reward": -0.16404810547828674, + "objective/rlhf_reward": 9.842905044555664, + "objective/scores": 10.006953239440918, + "policy/approxkl_avg": 0.237547367811203, + "policy/clipfrac_avg": 0.376953125, + "policy/entropy_avg": 0.311695396900177, + "step": 46, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1758, + "val/ratio": 1.0000953674316406, + "val/ratio_var": 2.808139925036812e-06 + }, + { + "episode": 24576, + "epoch": 0.10527758738862235, + "eps": 6, + "loss/policy_avg": 0.30603936314582825, + "lr": 2.4492187500000002e-06, + "objective/entropy": -41.17787551879883, + "objective/kl": 32.624759674072266, + "objective/non_score_reward": -0.16312378644943237, + "objective/rlhf_reward": 9.724346160888672, + "objective/scores": 9.887470245361328, + "policy/approxkl_avg": 0.29364651441574097, + "policy/clipfrac_avg": 0.353515625, + "policy/entropy_avg": 0.3342776894569397, + "step": 47, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1791, + "val/ratio": 1.000326156616211, + "val/ratio_var": 4.5322440200834535e-06 + }, + { + "episode": 25088, + "epoch": 0.10747087045921865, + "eps": 6, + "loss/policy_avg": 0.2642097771167755, + "lr": 2.4375e-06, + "objective/entropy": -39.200443267822266, + "objective/kl": 32.45161437988281, + "objective/non_score_reward": -0.1622580885887146, + "objective/rlhf_reward": 9.750046730041504, + "objective/scores": 9.912304878234863, + "policy/approxkl_avg": 0.30972546339035034, + "policy/clipfrac_avg": 0.345703125, + "policy/entropy_avg": 0.33516496419906616, + "step": 48, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1872, + "val/ratio": 1.0000823736190796, + "val/ratio_var": 3.669098759928602e-06 + }, + { + "episode": 25600, + "epoch": 0.10966415352981494, + "eps": 6, + "loss/policy_avg": 0.2650497555732727, + "lr": 2.42578125e-06, + "objective/entropy": -42.08147430419922, + "objective/kl": 32.27870178222656, + "objective/non_score_reward": -0.16139349341392517, + "objective/rlhf_reward": 9.627498626708984, + "objective/scores": 9.788891792297363, + "policy/approxkl_avg": 0.24272559583187103, + "policy/clipfrac_avg": 0.34375, + "policy/entropy_avg": 0.3419151306152344, + "step": 49, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1885, + "val/ratio": 1.0000895261764526, + "val/ratio_var": 3.1425495308212703e-06 + }, + { + "episode": 26112, + "epoch": 0.11185743660041124, + "eps": 6, + "loss/policy_avg": 0.19408683478832245, + "lr": 2.4140625000000002e-06, + "objective/entropy": -40.94256591796875, + "objective/kl": 31.59925651550293, + "objective/non_score_reward": -0.1579962968826294, + "objective/rlhf_reward": 9.357413291931152, + "objective/scores": 9.515409469604492, + "policy/approxkl_avg": 0.2539810538291931, + "policy/clipfrac_avg": 0.345703125, + "policy/entropy_avg": 0.33615610003471375, + "step": 50, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1861, + "val/ratio": 1.0000097751617432, + "val/ratio_var": 4.071615421707975e-06 + }, + { + "episode": 26624, + "epoch": 0.11405071967100754, + "eps": 6, + "loss/policy_avg": 0.12506979703903198, + "lr": 2.40234375e-06, + "objective/entropy": -43.869224548339844, + "objective/kl": 30.23246955871582, + "objective/non_score_reward": -0.15116234123706818, + "objective/rlhf_reward": 9.635049819946289, + "objective/scores": 9.786212921142578, + "policy/approxkl_avg": 0.22058835625648499, + "policy/clipfrac_avg": 0.3828125, + "policy/entropy_avg": 0.35689815878868103, + "step": 51, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1916, + "val/ratio": 1.0003091096878052, + "val/ratio_var": 3.508681174935191e-06 + }, + { + "episode": 27136, + "epoch": 0.11624400274160383, + "eps": 6, + "loss/policy_avg": 0.2060900628566742, + "lr": 2.390625e-06, + "objective/entropy": -45.10155487060547, + "objective/kl": 30.40532684326172, + "objective/non_score_reward": -0.15202662348747253, + "objective/rlhf_reward": 9.444595336914062, + "objective/scores": 9.596622467041016, + "policy/approxkl_avg": 0.2469383329153061, + "policy/clipfrac_avg": 0.337890625, + "policy/entropy_avg": 0.33809155225753784, + "step": 52, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1910, + "val/ratio": 0.9997259974479675, + "val/ratio_var": 3.441498392930953e-06 + }, + { + "episode": 27648, + "epoch": 0.11843728581220014, + "eps": 6, + "loss/policy_avg": 0.19970384240150452, + "lr": 2.3789062500000002e-06, + "objective/entropy": -43.50267028808594, + "objective/kl": 29.95575714111328, + "objective/non_score_reward": -0.14977876842021942, + "objective/rlhf_reward": 9.758223533630371, + "objective/scores": 9.908000946044922, + "policy/approxkl_avg": 0.23129594326019287, + "policy/clipfrac_avg": 0.34765625, + "policy/entropy_avg": 0.3461906909942627, + "step": 53, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1897, + "val/ratio": 1.00007164478302, + "val/ratio_var": 4.8727483772381674e-06 + }, + { + "episode": 28160, + "epoch": 0.12063056888279644, + "eps": 6, + "loss/policy_avg": 0.17737600207328796, + "lr": 2.3671875e-06, + "objective/entropy": -43.82114791870117, + "objective/kl": 30.448320388793945, + "objective/non_score_reward": -0.15224160254001617, + "objective/rlhf_reward": 9.586687088012695, + "objective/scores": 9.738929748535156, + "policy/approxkl_avg": 0.2304307222366333, + "policy/clipfrac_avg": 0.38671875, + "policy/entropy_avg": 0.35419148206710815, + "step": 54, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1930, + "val/ratio": 1.000131368637085, + "val/ratio_var": 2.835533905454213e-06 + }, + { + "episode": 28672, + "epoch": 0.12282385195339274, + "eps": 6, + "loss/policy_avg": 0.20781438052654266, + "lr": 2.35546875e-06, + "objective/entropy": -42.808937072753906, + "objective/kl": 30.716829299926758, + "objective/non_score_reward": -0.15358413755893707, + "objective/rlhf_reward": 9.48070240020752, + "objective/scores": 9.634286880493164, + "policy/approxkl_avg": 0.2257956713438034, + "policy/clipfrac_avg": 0.34375, + "policy/entropy_avg": 0.35503190755844116, + "step": 55, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1936, + "val/ratio": 0.9999227523803711, + "val/ratio_var": 3.047591235372238e-06 + }, + { + "episode": 29184, + "epoch": 0.12501713502398903, + "eps": 6, + "loss/policy_avg": 0.17236250638961792, + "lr": 2.3437500000000002e-06, + "objective/entropy": -42.00170135498047, + "objective/kl": 30.85394859313965, + "objective/non_score_reward": -0.154269739985466, + "objective/rlhf_reward": 9.769851684570312, + "objective/scores": 9.924121856689453, + "policy/approxkl_avg": 0.25827232003211975, + "policy/clipfrac_avg": 0.365234375, + "policy/entropy_avg": 0.35941898822784424, + "step": 56, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1899, + "val/ratio": 0.9998503923416138, + "val/ratio_var": 3.5402435969444923e-06 + }, + { + "episode": 29696, + "epoch": 0.12721041809458533, + "eps": 6, + "loss/policy_avg": 0.1874779909849167, + "lr": 2.33203125e-06, + "objective/entropy": -41.37017822265625, + "objective/kl": 30.405405044555664, + "objective/non_score_reward": -0.15202701091766357, + "objective/rlhf_reward": 9.552864074707031, + "objective/scores": 9.704890251159668, + "policy/approxkl_avg": 0.2579526901245117, + "policy/clipfrac_avg": 0.3671875, + "policy/entropy_avg": 0.36883309483528137, + "step": 57, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1996, + "val/ratio": 1.0000630617141724, + "val/ratio_var": 3.2543882753088837e-06 + }, + { + "episode": 30208, + "epoch": 0.12940370116518163, + "eps": 6, + "loss/policy_avg": 0.24861405789852142, + "lr": 2.3203125e-06, + "objective/entropy": -39.66204833984375, + "objective/kl": 32.23133850097656, + "objective/non_score_reward": -0.16115668416023254, + "objective/rlhf_reward": 9.291080474853516, + "objective/scores": 9.452238082885742, + "policy/approxkl_avg": 0.24864479899406433, + "policy/clipfrac_avg": 0.34765625, + "policy/entropy_avg": 0.39884617924690247, + "step": 58, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1956, + "val/ratio": 0.9995733499526978, + "val/ratio_var": 2.4565586045355303e-06 + }, + { + "episode": 30720, + "epoch": 0.13159698423577793, + "eps": 6, + "loss/policy_avg": 0.15614402294158936, + "lr": 2.30859375e-06, + "objective/entropy": -39.843353271484375, + "objective/kl": 31.747220993041992, + "objective/non_score_reward": -0.15873610973358154, + "objective/rlhf_reward": 9.32120418548584, + "objective/scores": 9.479940414428711, + "policy/approxkl_avg": 0.2175564169883728, + "policy/clipfrac_avg": 0.375, + "policy/entropy_avg": 0.40173786878585815, + "step": 59, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1870, + "val/ratio": 1.0000216960906982, + "val/ratio_var": 2.7907508410862647e-06 + }, + { + "episode": 31232, + "epoch": 0.13379026730637422, + "eps": 6, + "loss/policy_avg": 0.1781693994998932, + "lr": 2.296875e-06, + "objective/entropy": -39.122676849365234, + "objective/kl": 32.07661437988281, + "objective/non_score_reward": -0.16038307547569275, + "objective/rlhf_reward": 9.458475112915039, + "objective/scores": 9.618858337402344, + "policy/approxkl_avg": 0.2337334156036377, + "policy/clipfrac_avg": 0.3828125, + "policy/entropy_avg": 0.40583640336990356, + "step": 60, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1988, + "val/ratio": 1.000226378440857, + "val/ratio_var": 2.6703253297455376e-06 + }, + { + "episode": 31744, + "epoch": 0.13598355037697052, + "eps": 6, + "loss/policy_avg": 0.22470299899578094, + "lr": 2.28515625e-06, + "objective/entropy": -35.38658905029297, + "objective/kl": 33.76303482055664, + "objective/non_score_reward": -0.16881518065929413, + "objective/rlhf_reward": 9.489850044250488, + "objective/scores": 9.658665657043457, + "policy/approxkl_avg": 0.2829976975917816, + "policy/clipfrac_avg": 0.359375, + "policy/entropy_avg": 0.40766507387161255, + "step": 61, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1899, + "val/ratio": 1.0002920627593994, + "val/ratio_var": 4.702184469351778e-06 + }, + { + "episode": 32256, + "epoch": 0.13817683344756682, + "eps": 6, + "loss/policy_avg": 0.18476197123527527, + "lr": 2.2734375e-06, + "objective/entropy": -35.610679626464844, + "objective/kl": 32.4565544128418, + "objective/non_score_reward": -0.1622827649116516, + "objective/rlhf_reward": 9.31367015838623, + "objective/scores": 9.475953102111816, + "policy/approxkl_avg": 0.2603687644004822, + "policy/clipfrac_avg": 0.375, + "policy/entropy_avg": 0.4185265302658081, + "step": 62, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1909, + "val/ratio": 0.9998178482055664, + "val/ratio_var": 3.281563522250508e-06 + }, + { + "episode": 32768, + "epoch": 0.1403701165181631, + "eps": 6, + "loss/policy_avg": 0.26672354340553284, + "lr": 2.26171875e-06, + "objective/entropy": -34.83911895751953, + "objective/kl": 33.32848358154297, + "objective/non_score_reward": -0.166642427444458, + "objective/rlhf_reward": 9.365612030029297, + "objective/scores": 9.532255172729492, + "policy/approxkl_avg": 0.21640878915786743, + "policy/clipfrac_avg": 0.35546875, + "policy/entropy_avg": 0.4103761911392212, + "step": 63, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1924, + "val/ratio": 1.000182867050171, + "val/ratio_var": 2.308673174411524e-06 + }, + { + "episode": 33280, + "epoch": 0.14256339958875944, + "eps": 6, + "loss/policy_avg": 0.22516316175460815, + "lr": 2.25e-06, + "objective/entropy": -35.21662902832031, + "objective/kl": 33.290374755859375, + "objective/non_score_reward": -0.16645187139511108, + "objective/rlhf_reward": 9.381125450134277, + "objective/scores": 9.547576904296875, + "policy/approxkl_avg": 0.3286188542842865, + "policy/clipfrac_avg": 0.357421875, + "policy/entropy_avg": 0.41928941011428833, + "step": 64, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1964, + "val/ratio": 1.000014066696167, + "val/ratio_var": 3.4851202599384123e-06 + }, + { + "episode": 33792, + "epoch": 0.14475668265935573, + "eps": 6, + "loss/policy_avg": 0.18076206743717194, + "lr": 2.23828125e-06, + "objective/entropy": -37.27294921875, + "objective/kl": 32.67399978637695, + "objective/non_score_reward": -0.16336998343467712, + "objective/rlhf_reward": 9.350773811340332, + "objective/scores": 9.514143943786621, + "policy/approxkl_avg": 0.23823928833007812, + "policy/clipfrac_avg": 0.388671875, + "policy/entropy_avg": 0.4244874119758606, + "step": 65, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1935, + "val/ratio": 0.9997513294219971, + "val/ratio_var": 3.028096216439735e-06 + }, + { + "episode": 34304, + "epoch": 0.14694996572995203, + "eps": 6, + "loss/policy_avg": 0.17251071333885193, + "lr": 2.2265625e-06, + "objective/entropy": -33.87648010253906, + "objective/kl": 33.30876159667969, + "objective/non_score_reward": -0.16654381155967712, + "objective/rlhf_reward": 9.384592056274414, + "objective/scores": 9.551136016845703, + "policy/approxkl_avg": 0.25553739070892334, + "policy/clipfrac_avg": 0.369140625, + "policy/entropy_avg": 0.450764536857605, + "step": 66, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1977, + "val/ratio": 0.9999786019325256, + "val/ratio_var": 3.189375547663076e-06 + }, + { + "episode": 34816, + "epoch": 0.14914324880054833, + "eps": 6, + "loss/policy_avg": 0.2089308649301529, + "lr": 2.21484375e-06, + "objective/entropy": -34.71873092651367, + "objective/kl": 32.921295166015625, + "objective/non_score_reward": -0.1646064817905426, + "objective/rlhf_reward": 9.405241966247559, + "objective/scores": 9.56984806060791, + "policy/approxkl_avg": 0.24852168560028076, + "policy/clipfrac_avg": 0.369140625, + "policy/entropy_avg": 0.4264836013317108, + "step": 67, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2023, + "val/ratio": 0.9999498724937439, + "val/ratio_var": 3.718509105965495e-06 + }, + { + "episode": 35328, + "epoch": 0.15133653187114463, + "eps": 6, + "loss/policy_avg": 0.30707770586013794, + "lr": 2.203125e-06, + "objective/entropy": -33.18562698364258, + "objective/kl": 34.398094177246094, + "objective/non_score_reward": -0.17199048399925232, + "objective/rlhf_reward": 9.318955421447754, + "objective/scores": 9.490945816040039, + "policy/approxkl_avg": 0.2854606509208679, + "policy/clipfrac_avg": 0.35546875, + "policy/entropy_avg": 0.43861570954322815, + "step": 68, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2134, + "val/ratio": 1.0002244710922241, + "val/ratio_var": 3.3594094475120073e-06 + }, + { + "episode": 35840, + "epoch": 0.15352981494174092, + "eps": 6, + "loss/policy_avg": 0.21776500344276428, + "lr": 2.19140625e-06, + "objective/entropy": -33.90901565551758, + "objective/kl": 33.42001724243164, + "objective/non_score_reward": -0.16710007190704346, + "objective/rlhf_reward": 9.314750671386719, + "objective/scores": 9.481850624084473, + "policy/approxkl_avg": 0.27862101793289185, + "policy/clipfrac_avg": 0.318359375, + "policy/entropy_avg": 0.4583716094493866, + "step": 69, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2177, + "val/ratio": 1.0001673698425293, + "val/ratio_var": 3.758811544685159e-06 + }, + { + "episode": 36352, + "epoch": 0.15572309801233722, + "eps": 6, + "loss/policy_avg": 0.2049793154001236, + "lr": 2.1796875e-06, + "objective/entropy": -33.58552169799805, + "objective/kl": 33.38475799560547, + "objective/non_score_reward": -0.16692380607128143, + "objective/rlhf_reward": 9.471214294433594, + "objective/scores": 9.638137817382812, + "policy/approxkl_avg": 0.23455402255058289, + "policy/clipfrac_avg": 0.353515625, + "policy/entropy_avg": 0.4269852638244629, + "step": 70, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2168, + "val/ratio": 1.0000548362731934, + "val/ratio_var": 3.26197096001124e-06 + }, + { + "episode": 36864, + "epoch": 0.15791638108293352, + "eps": 6, + "loss/policy_avg": 0.2173652946949005, + "lr": 2.16796875e-06, + "objective/entropy": -32.59899139404297, + "objective/kl": 34.50146484375, + "objective/non_score_reward": -0.17250731587409973, + "objective/rlhf_reward": 9.358166694641113, + "objective/scores": 9.53067398071289, + "policy/approxkl_avg": 0.26884737610816956, + "policy/clipfrac_avg": 0.3828125, + "policy/entropy_avg": 0.4478238523006439, + "step": 71, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2152, + "val/ratio": 0.9999924302101135, + "val/ratio_var": 3.7096308460604632e-06 + }, + { + "episode": 37376, + "epoch": 0.1601096641535298, + "eps": 6, + "loss/policy_avg": 0.1738312691450119, + "lr": 2.15625e-06, + "objective/entropy": -33.73279571533203, + "objective/kl": 34.007720947265625, + "objective/non_score_reward": -0.1700385957956314, + "objective/rlhf_reward": 9.712380409240723, + "objective/scores": 9.88241958618164, + "policy/approxkl_avg": 0.27387821674346924, + "policy/clipfrac_avg": 0.365234375, + "policy/entropy_avg": 0.44403284788131714, + "step": 72, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2140, + "val/ratio": 0.9999028444290161, + "val/ratio_var": 4.58646536571905e-06 + }, + { + "episode": 37888, + "epoch": 0.1623029472241261, + "eps": 6, + "loss/policy_avg": 0.16620802879333496, + "lr": 2.14453125e-06, + "objective/entropy": -35.81139373779297, + "objective/kl": 32.9052734375, + "objective/non_score_reward": -0.1645263433456421, + "objective/rlhf_reward": 9.479964256286621, + "objective/scores": 9.644491195678711, + "policy/approxkl_avg": 0.25569581985473633, + "policy/clipfrac_avg": 0.3984375, + "policy/entropy_avg": 0.42350655794143677, + "step": 73, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2139, + "val/ratio": 0.9998481273651123, + "val/ratio_var": 3.5795840176433558e-06 + }, + { + "episode": 38400, + "epoch": 0.1644962302947224, + "eps": 6, + "loss/policy_avg": 0.22138690948486328, + "lr": 2.1328125e-06, + "objective/entropy": -34.88341522216797, + "objective/kl": 33.08055877685547, + "objective/non_score_reward": -0.16540278494358063, + "objective/rlhf_reward": 9.436692237854004, + "objective/scores": 9.602094650268555, + "policy/approxkl_avg": 0.21768277883529663, + "policy/clipfrac_avg": 0.33203125, + "policy/entropy_avg": 0.43899866938591003, + "step": 74, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2290, + "val/ratio": 1.000012993812561, + "val/ratio_var": 2.8327724521659547e-06 + }, + { + "episode": 38912, + "epoch": 0.1666895133653187, + "eps": 6, + "loss/policy_avg": 0.17552754282951355, + "lr": 2.12109375e-06, + "objective/entropy": -32.78399658203125, + "objective/kl": 33.330780029296875, + "objective/non_score_reward": -0.16665390133857727, + "objective/rlhf_reward": 9.520395278930664, + "objective/scores": 9.687049865722656, + "policy/approxkl_avg": 0.24098150432109833, + "policy/clipfrac_avg": 0.396484375, + "policy/entropy_avg": 0.43451428413391113, + "step": 75, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2224, + "val/ratio": 1.0003948211669922, + "val/ratio_var": 3.863194706354989e-06 + }, + { + "episode": 39424, + "epoch": 0.168882796435915, + "eps": 6, + "loss/policy_avg": 0.16361400485038757, + "lr": 2.109375e-06, + "objective/entropy": -36.86976623535156, + "objective/kl": 32.36768341064453, + "objective/non_score_reward": -0.16183842718601227, + "objective/rlhf_reward": 9.400718688964844, + "objective/scores": 9.562556266784668, + "policy/approxkl_avg": 0.2723867893218994, + "policy/clipfrac_avg": 0.353515625, + "policy/entropy_avg": 0.4423583149909973, + "step": 76, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2366, + "val/ratio": 0.9998538494110107, + "val/ratio_var": 3.881733846355928e-06 + }, + { + "episode": 39936, + "epoch": 0.1710760795065113, + "eps": 6, + "loss/policy_avg": 0.20113158226013184, + "lr": 2.09765625e-06, + "objective/entropy": -36.3262939453125, + "objective/kl": 31.993881225585938, + "objective/non_score_reward": -0.15996940433979034, + "objective/rlhf_reward": 9.591435432434082, + "objective/scores": 9.751405715942383, + "policy/approxkl_avg": 0.21415743231773376, + "policy/clipfrac_avg": 0.330078125, + "policy/entropy_avg": 0.417125403881073, + "step": 77, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2304, + "val/ratio": 1.0003247261047363, + "val/ratio_var": 3.1106928872759454e-06 + }, + { + "episode": 40448, + "epoch": 0.17326936257710762, + "eps": 6, + "loss/policy_avg": 0.19219069182872772, + "lr": 2.0859375e-06, + "objective/entropy": -35.67451858520508, + "objective/kl": 32.77976989746094, + "objective/non_score_reward": -0.16389885544776917, + "objective/rlhf_reward": 9.648540496826172, + "objective/scores": 9.81243896484375, + "policy/approxkl_avg": 0.24051856994628906, + "policy/clipfrac_avg": 0.357421875, + "policy/entropy_avg": 0.39866721630096436, + "step": 78, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2242, + "val/ratio": 0.9996334910392761, + "val/ratio_var": 2.991866267620935e-06 + }, + { + "episode": 40960, + "epoch": 0.17546264564770392, + "eps": 6, + "loss/policy_avg": 0.22769129276275635, + "lr": 2.07421875e-06, + "objective/entropy": -33.94448471069336, + "objective/kl": 33.3216552734375, + "objective/non_score_reward": -0.1666082888841629, + "objective/rlhf_reward": 9.335718154907227, + "objective/scores": 9.502326965332031, + "policy/approxkl_avg": 0.2864202558994293, + "policy/clipfrac_avg": 0.3515625, + "policy/entropy_avg": 0.4178268313407898, + "step": 79, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2216, + "val/ratio": 1.0000176429748535, + "val/ratio_var": 2.602319455036195e-06 + }, + { + "episode": 41472, + "epoch": 0.17765592871830022, + "eps": 6, + "loss/policy_avg": 0.1986619234085083, + "lr": 2.0625e-06, + "objective/entropy": -36.479896545410156, + "objective/kl": 33.37843322753906, + "objective/non_score_reward": -0.1668921709060669, + "objective/rlhf_reward": 9.689064025878906, + "objective/scores": 9.855956077575684, + "policy/approxkl_avg": 0.2828975319862366, + "policy/clipfrac_avg": 0.369140625, + "policy/entropy_avg": 0.39649999141693115, + "step": 80, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2166, + "val/ratio": 1.0000145435333252, + "val/ratio_var": 3.323544660815969e-06 + }, + { + "episode": 41984, + "epoch": 0.1798492117888965, + "eps": 6, + "loss/policy_avg": 0.36803221702575684, + "lr": 2.05078125e-06, + "objective/entropy": -34.6856803894043, + "objective/kl": 32.34107971191406, + "objective/non_score_reward": -0.1617053896188736, + "objective/rlhf_reward": 9.39497184753418, + "objective/scores": 9.556676864624023, + "policy/approxkl_avg": 0.30969473719596863, + "policy/clipfrac_avg": 0.337890625, + "policy/entropy_avg": 0.40510374307632446, + "step": 81, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2162, + "val/ratio": 0.9998123049736023, + "val/ratio_var": 5.09162737216684e-06 + }, + { + "episode": 42496, + "epoch": 0.1820424948594928, + "eps": 6, + "loss/policy_avg": 0.16523130238056183, + "lr": 2.0390625e-06, + "objective/entropy": -36.272003173828125, + "objective/kl": 34.003143310546875, + "objective/non_score_reward": -0.17001570761203766, + "objective/rlhf_reward": 9.380304336547852, + "objective/scores": 9.55031967163086, + "policy/approxkl_avg": 0.24730241298675537, + "policy/clipfrac_avg": 0.345703125, + "policy/entropy_avg": 0.40937918424606323, + "step": 82, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2186, + "val/ratio": 0.999603807926178, + "val/ratio_var": 2.4847686290740967e-06 + }, + { + "episode": 43008, + "epoch": 0.1842357779300891, + "eps": 6, + "loss/policy_avg": 0.2093135118484497, + "lr": 2.02734375e-06, + "objective/entropy": -35.804931640625, + "objective/kl": 33.306732177734375, + "objective/non_score_reward": -0.1665336787700653, + "objective/rlhf_reward": 9.440333366394043, + "objective/scores": 9.606866836547852, + "policy/approxkl_avg": 0.22019661962985992, + "policy/clipfrac_avg": 0.3359375, + "policy/entropy_avg": 0.404262512922287, + "step": 83, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2208, + "val/ratio": 1.000349521636963, + "val/ratio_var": 2.718377345445333e-06 + }, + { + "episode": 43520, + "epoch": 0.1864290610006854, + "eps": 6, + "loss/policy_avg": 0.24589769542217255, + "lr": 2.015625e-06, + "objective/entropy": -35.195167541503906, + "objective/kl": 32.886680603027344, + "objective/non_score_reward": -0.16443338990211487, + "objective/rlhf_reward": 9.737478256225586, + "objective/scores": 9.901910781860352, + "policy/approxkl_avg": 0.224727600812912, + "policy/clipfrac_avg": 0.357421875, + "policy/entropy_avg": 0.3911252021789551, + "step": 84, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2169, + "val/ratio": 1.0001827478408813, + "val/ratio_var": 2.8814699817303335e-06 + }, + { + "episode": 44032, + "epoch": 0.1886223440712817, + "eps": 6, + "loss/policy_avg": 0.2011430561542511, + "lr": 2.00390625e-06, + "objective/entropy": -36.270198822021484, + "objective/kl": 32.86201095581055, + "objective/non_score_reward": -0.1643100529909134, + "objective/rlhf_reward": 9.618223190307617, + "objective/scores": 9.782532691955566, + "policy/approxkl_avg": 0.29179248213768005, + "policy/clipfrac_avg": 0.33203125, + "policy/entropy_avg": 0.3870548605918884, + "step": 85, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2167, + "val/ratio": 1.0001106262207031, + "val/ratio_var": 3.691868414534838e-06 + }, + { + "episode": 44544, + "epoch": 0.190815627141878, + "eps": 6, + "loss/policy_avg": 0.1226603090763092, + "lr": 1.9921875e-06, + "objective/entropy": -35.539337158203125, + "objective/kl": 33.36035919189453, + "objective/non_score_reward": -0.1668018102645874, + "objective/rlhf_reward": 9.765243530273438, + "objective/scores": 9.932045936584473, + "policy/approxkl_avg": 0.23382121324539185, + "policy/clipfrac_avg": 0.365234375, + "policy/entropy_avg": 0.39480116963386536, + "step": 86, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2204, + "val/ratio": 0.9996500015258789, + "val/ratio_var": 3.449556515988661e-06 + }, + { + "episode": 45056, + "epoch": 0.1930089102124743, + "eps": 6, + "loss/policy_avg": 0.22705024480819702, + "lr": 1.98046875e-06, + "objective/entropy": -34.78813171386719, + "objective/kl": 33.85824203491211, + "objective/non_score_reward": -0.1692911982536316, + "objective/rlhf_reward": 9.508092880249023, + "objective/scores": 9.677383422851562, + "policy/approxkl_avg": 0.2564248740673065, + "policy/clipfrac_avg": 0.341796875, + "policy/entropy_avg": 0.39244288206100464, + "step": 87, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2230, + "val/ratio": 0.9998552799224854, + "val/ratio_var": 2.9433140298351645e-06 + }, + { + "episode": 45568, + "epoch": 0.1952021932830706, + "eps": 6, + "loss/policy_avg": 0.25029683113098145, + "lr": 1.96875e-06, + "objective/entropy": -38.32817077636719, + "objective/kl": 32.53593826293945, + "objective/non_score_reward": -0.16267967224121094, + "objective/rlhf_reward": 9.676530838012695, + "objective/scores": 9.839210510253906, + "policy/approxkl_avg": 0.24715110659599304, + "policy/clipfrac_avg": 0.345703125, + "policy/entropy_avg": 0.36855989694595337, + "step": 88, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2186, + "val/ratio": 1.0001134872436523, + "val/ratio_var": 3.477802692941623e-06 + }, + { + "episode": 46080, + "epoch": 0.1973954763536669, + "eps": 6, + "loss/policy_avg": 0.2676100730895996, + "lr": 1.95703125e-06, + "objective/entropy": -34.89455795288086, + "objective/kl": 34.006202697753906, + "objective/non_score_reward": -0.17003101110458374, + "objective/rlhf_reward": 9.682475090026855, + "objective/scores": 9.852506637573242, + "policy/approxkl_avg": 0.2565373182296753, + "policy/clipfrac_avg": 0.337890625, + "policy/entropy_avg": 0.38856256008148193, + "step": 89, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2148, + "val/ratio": 0.9999434947967529, + "val/ratio_var": 3.6854912650596816e-06 + }, + { + "episode": 46592, + "epoch": 0.19958875942426318, + "eps": 6, + "loss/policy_avg": 0.17851054668426514, + "lr": 1.9453125e-06, + "objective/entropy": -37.629817962646484, + "objective/kl": 33.07708740234375, + "objective/non_score_reward": -0.1653854250907898, + "objective/rlhf_reward": 9.640422821044922, + "objective/scores": 9.805809020996094, + "policy/approxkl_avg": 0.2017795443534851, + "policy/clipfrac_avg": 0.34765625, + "policy/entropy_avg": 0.37295690178871155, + "step": 90, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2112, + "val/ratio": 1.0001370906829834, + "val/ratio_var": 2.355475317017408e-06 + }, + { + "episode": 47104, + "epoch": 0.20178204249485948, + "eps": 6, + "loss/policy_avg": 0.1686045229434967, + "lr": 1.93359375e-06, + "objective/entropy": -38.061744689941406, + "objective/kl": 32.129512786865234, + "objective/non_score_reward": -0.16064755618572235, + "objective/rlhf_reward": 9.790443420410156, + "objective/scores": 9.951091766357422, + "policy/approxkl_avg": 0.21093934774398804, + "policy/clipfrac_avg": 0.36328125, + "policy/entropy_avg": 0.3660760521888733, + "step": 91, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2086, + "val/ratio": 0.9999697208404541, + "val/ratio_var": 3.8051555293350248e-06 + }, + { + "episode": 47616, + "epoch": 0.20397532556545578, + "eps": 6, + "loss/policy_avg": 0.20496009290218353, + "lr": 1.921875e-06, + "objective/entropy": -38.42815399169922, + "objective/kl": 33.15034484863281, + "objective/non_score_reward": -0.16575172543525696, + "objective/rlhf_reward": 9.645820617675781, + "objective/scores": 9.811572074890137, + "policy/approxkl_avg": 0.2742815613746643, + "policy/clipfrac_avg": 0.353515625, + "policy/entropy_avg": 0.35775893926620483, + "step": 92, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2077, + "val/ratio": 1.0003482103347778, + "val/ratio_var": 3.3726596484484617e-06 + }, + { + "episode": 48128, + "epoch": 0.2061686086360521, + "eps": 6, + "loss/policy_avg": 0.1944173276424408, + "lr": 1.91015625e-06, + "objective/entropy": -37.453712463378906, + "objective/kl": 33.70136260986328, + "objective/non_score_reward": -0.16850680112838745, + "objective/rlhf_reward": 9.822844505310059, + "objective/scores": 9.991350173950195, + "policy/approxkl_avg": 0.264553964138031, + "policy/clipfrac_avg": 0.337890625, + "policy/entropy_avg": 0.3743836581707001, + "step": 93, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2082, + "val/ratio": 0.9996594786643982, + "val/ratio_var": 2.970580226246966e-06 + }, + { + "episode": 48640, + "epoch": 0.2083618917066484, + "eps": 6, + "loss/policy_avg": 0.20501361787319183, + "lr": 1.8984375e-06, + "objective/entropy": -36.34868621826172, + "objective/kl": 33.67938995361328, + "objective/non_score_reward": -0.1683969646692276, + "objective/rlhf_reward": 9.771129608154297, + "objective/scores": 9.939525604248047, + "policy/approxkl_avg": 0.29675331711769104, + "policy/clipfrac_avg": 0.31640625, + "policy/entropy_avg": 0.35725563764572144, + "step": 94, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2083, + "val/ratio": 1.0001224279403687, + "val/ratio_var": 3.695943178172456e-06 + }, + { + "episode": 49152, + "epoch": 0.2105551747772447, + "eps": 6, + "loss/policy_avg": 0.19600501656532288, + "lr": 1.8867187500000001e-06, + "objective/entropy": -34.53193664550781, + "objective/kl": 34.90612030029297, + "objective/non_score_reward": -0.17453059554100037, + "objective/rlhf_reward": 9.953956604003906, + "objective/scores": 10.128486633300781, + "policy/approxkl_avg": 0.21787747740745544, + "policy/clipfrac_avg": 0.318359375, + "policy/entropy_avg": 0.339312881231308, + "step": 95, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1995, + "val/ratio": 0.999893844127655, + "val/ratio_var": 2.4975431642815238e-06 + }, + { + "episode": 49664, + "epoch": 0.212748457847841, + "eps": 6, + "loss/policy_avg": 0.22088220715522766, + "lr": 1.875e-06, + "objective/entropy": -34.256370544433594, + "objective/kl": 35.238216400146484, + "objective/non_score_reward": -0.1761910766363144, + "objective/rlhf_reward": 9.564779281616211, + "objective/scores": 9.74096965789795, + "policy/approxkl_avg": 0.245886892080307, + "policy/clipfrac_avg": 0.35546875, + "policy/entropy_avg": 0.35175642371177673, + "step": 96, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1972, + "val/ratio": 0.9998222589492798, + "val/ratio_var": 3.3520850593049545e-06 + }, + { + "episode": 50176, + "epoch": 0.2149417409184373, + "eps": 6, + "loss/policy_avg": 0.1739550083875656, + "lr": 1.86328125e-06, + "objective/entropy": -36.02185821533203, + "objective/kl": 35.261474609375, + "objective/non_score_reward": -0.17630735039710999, + "objective/rlhf_reward": 9.913810729980469, + "objective/scores": 10.090118408203125, + "policy/approxkl_avg": 0.19460904598236084, + "policy/clipfrac_avg": 0.337890625, + "policy/entropy_avg": 0.3500151038169861, + "step": 97, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2014, + "val/ratio": 1.0000877380371094, + "val/ratio_var": 2.4969581318146084e-06 + }, + { + "episode": 50688, + "epoch": 0.2171350239890336, + "eps": 6, + "loss/policy_avg": 0.27473074197769165, + "lr": 1.8515625000000001e-06, + "objective/entropy": -35.280155181884766, + "objective/kl": 36.09398651123047, + "objective/non_score_reward": -0.18046993017196655, + "objective/rlhf_reward": 9.731412887573242, + "objective/scores": 9.911883354187012, + "policy/approxkl_avg": 0.23806479573249817, + "policy/clipfrac_avg": 0.34765625, + "policy/entropy_avg": 0.36351966857910156, + "step": 98, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2009, + "val/ratio": 1.0001752376556396, + "val/ratio_var": 4.085913587914547e-06 + }, + { + "episode": 51200, + "epoch": 0.21932830705962988, + "eps": 6, + "loss/policy_avg": 0.28736796975135803, + "lr": 1.83984375e-06, + "objective/entropy": -35.07633972167969, + "objective/kl": 35.50556182861328, + "objective/non_score_reward": -0.1775278002023697, + "objective/rlhf_reward": 9.971885681152344, + "objective/scores": 10.1494140625, + "policy/approxkl_avg": 0.22533981502056122, + "policy/clipfrac_avg": 0.345703125, + "policy/entropy_avg": 0.33290359377861023, + "step": 99, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1977, + "val/ratio": 0.9998661279678345, + "val/ratio_var": 5.175881597097032e-06 + }, + { + "episode": 51712, + "epoch": 0.22152159013022618, + "eps": 6, + "loss/policy_avg": 0.24332170188426971, + "lr": 1.828125e-06, + "objective/entropy": -31.962793350219727, + "objective/kl": 36.79883575439453, + "objective/non_score_reward": -0.18399417400360107, + "objective/rlhf_reward": 9.824310302734375, + "objective/scores": 10.008304595947266, + "policy/approxkl_avg": 0.2577008008956909, + "policy/clipfrac_avg": 0.40234375, + "policy/entropy_avg": 0.3309575021266937, + "step": 100, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1903, + "val/ratio": 1.000428318977356, + "val/ratio_var": 2.6899542717728764e-06 + }, + { + "episode": 52224, + "epoch": 0.22371487320082248, + "eps": 6, + "loss/policy_avg": 0.17923694849014282, + "lr": 1.81640625e-06, + "objective/entropy": -34.664024353027344, + "objective/kl": 35.354034423828125, + "objective/non_score_reward": -0.1767701804637909, + "objective/rlhf_reward": 10.139245986938477, + "objective/scores": 10.316017150878906, + "policy/approxkl_avg": 0.19910496473312378, + "policy/clipfrac_avg": 0.31640625, + "policy/entropy_avg": 0.3107799291610718, + "step": 101, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1891, + "val/ratio": 0.9996460676193237, + "val/ratio_var": 3.313231900392566e-06 + }, + { + "episode": 52736, + "epoch": 0.22590815627141878, + "eps": 6, + "loss/policy_avg": 0.17553430795669556, + "lr": 1.8046875e-06, + "objective/entropy": -34.35253143310547, + "objective/kl": 35.79362869262695, + "objective/non_score_reward": -0.1789681315422058, + "objective/rlhf_reward": 10.026729583740234, + "objective/scores": 10.205698013305664, + "policy/approxkl_avg": 0.21303945779800415, + "policy/clipfrac_avg": 0.328125, + "policy/entropy_avg": 0.3087007999420166, + "step": 102, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1905, + "val/ratio": 0.9998683929443359, + "val/ratio_var": 3.267524562033941e-06 + }, + { + "episode": 53248, + "epoch": 0.22810143934201507, + "eps": 6, + "loss/policy_avg": 0.20496255159378052, + "lr": 1.79296875e-06, + "objective/entropy": -34.044639587402344, + "objective/kl": 35.621826171875, + "objective/non_score_reward": -0.17810912430286407, + "objective/rlhf_reward": 10.048910140991211, + "objective/scores": 10.227019309997559, + "policy/approxkl_avg": 0.2181515246629715, + "policy/clipfrac_avg": 0.357421875, + "policy/entropy_avg": 0.3069969415664673, + "step": 103, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1940, + "val/ratio": 0.9999847412109375, + "val/ratio_var": 3.6619542242988246e-06 + }, + { + "episode": 53760, + "epoch": 0.23029472241261137, + "eps": 6, + "loss/policy_avg": 0.22224432229995728, + "lr": 1.78125e-06, + "objective/entropy": -32.43903732299805, + "objective/kl": 36.86625289916992, + "objective/non_score_reward": -0.1843312531709671, + "objective/rlhf_reward": 9.84052562713623, + "objective/scores": 10.024856567382812, + "policy/approxkl_avg": 0.1902885138988495, + "policy/clipfrac_avg": 0.328125, + "policy/entropy_avg": 0.30074501037597656, + "step": 104, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1854, + "val/ratio": 1.0004634857177734, + "val/ratio_var": 2.30164823733503e-06 + }, + { + "episode": 54272, + "epoch": 0.23248800548320767, + "eps": 6, + "loss/policy_avg": 0.13901078701019287, + "lr": 1.76953125e-06, + "objective/entropy": -33.599708557128906, + "objective/kl": 35.801998138427734, + "objective/non_score_reward": -0.17900997400283813, + "objective/rlhf_reward": 9.889688491821289, + "objective/scores": 10.06869888305664, + "policy/approxkl_avg": 0.16544947028160095, + "policy/clipfrac_avg": 0.337890625, + "policy/entropy_avg": 0.30036184191703796, + "step": 105, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1859, + "val/ratio": 1.0002105236053467, + "val/ratio_var": 2.6306556719646323e-06 + }, + { + "episode": 54784, + "epoch": 0.23468128855380396, + "eps": 6, + "loss/policy_avg": 0.14493289589881897, + "lr": 1.7578125e-06, + "objective/entropy": -33.55315017700195, + "objective/kl": 35.825531005859375, + "objective/non_score_reward": -0.17912766337394714, + "objective/rlhf_reward": 9.871797561645508, + "objective/scores": 10.050926208496094, + "policy/approxkl_avg": 0.1833547204732895, + "policy/clipfrac_avg": 0.328125, + "policy/entropy_avg": 0.2901462912559509, + "step": 106, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1835, + "val/ratio": 0.9998902678489685, + "val/ratio_var": 2.489499593139044e-06 + }, + { + "episode": 55296, + "epoch": 0.2368745716244003, + "eps": 6, + "loss/policy_avg": 0.17336603999137878, + "lr": 1.74609375e-06, + "objective/entropy": -34.462337493896484, + "objective/kl": 37.12619400024414, + "objective/non_score_reward": -0.18563096225261688, + "objective/rlhf_reward": 10.155291557312012, + "objective/scores": 10.340921401977539, + "policy/approxkl_avg": 0.18350008130073547, + "policy/clipfrac_avg": 0.3359375, + "policy/entropy_avg": 0.29074928164482117, + "step": 107, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1954, + "val/ratio": 1.0000966787338257, + "val/ratio_var": 2.0360935195640195e-06 + }, + { + "episode": 55808, + "epoch": 0.23906785469499658, + "eps": 6, + "loss/policy_avg": 0.20065046846866608, + "lr": 1.734375e-06, + "objective/entropy": -35.58911895751953, + "objective/kl": 35.46983337402344, + "objective/non_score_reward": -0.17734915018081665, + "objective/rlhf_reward": 9.807424545288086, + "objective/scores": 9.984773635864258, + "policy/approxkl_avg": 0.21259769797325134, + "policy/clipfrac_avg": 0.3359375, + "policy/entropy_avg": 0.2804480195045471, + "step": 108, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1868, + "val/ratio": 0.9998682737350464, + "val/ratio_var": 4.149359028815525e-06 + }, + { + "episode": 56320, + "epoch": 0.24126113776559288, + "eps": 6, + "loss/policy_avg": 0.31327635049819946, + "lr": 1.72265625e-06, + "objective/entropy": -35.60870361328125, + "objective/kl": 34.600460052490234, + "objective/non_score_reward": -0.17300228774547577, + "objective/rlhf_reward": 9.828847885131836, + "objective/scores": 10.001850128173828, + "policy/approxkl_avg": 0.2347819209098816, + "policy/clipfrac_avg": 0.326171875, + "policy/entropy_avg": 0.2796494960784912, + "step": 109, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1903, + "val/ratio": 1.000025749206543, + "val/ratio_var": 3.987162926932797e-06 + }, + { + "episode": 56832, + "epoch": 0.24345442083618918, + "eps": 6, + "loss/policy_avg": 0.19177474081516266, + "lr": 1.7109375e-06, + "objective/entropy": -35.206573486328125, + "objective/kl": 35.755706787109375, + "objective/non_score_reward": -0.1787785291671753, + "objective/rlhf_reward": 10.03399658203125, + "objective/scores": 10.212775230407715, + "policy/approxkl_avg": 0.2251434177160263, + "policy/clipfrac_avg": 0.388671875, + "policy/entropy_avg": 0.2768189311027527, + "step": 110, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1854, + "val/ratio": 0.9997500777244568, + "val/ratio_var": 2.821056341417716e-06 + }, + { + "episode": 57344, + "epoch": 0.24564770390678548, + "eps": 6, + "loss/policy_avg": 0.17819970846176147, + "lr": 1.69921875e-06, + "objective/entropy": -35.49714279174805, + "objective/kl": 35.21353530883789, + "objective/non_score_reward": -0.17606768012046814, + "objective/rlhf_reward": 9.934215545654297, + "objective/scores": 10.110282897949219, + "policy/approxkl_avg": 0.2090197205543518, + "policy/clipfrac_avg": 0.30859375, + "policy/entropy_avg": 0.2791983485221863, + "step": 111, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1825, + "val/ratio": 1.0002068281173706, + "val/ratio_var": 3.191226596754859e-06 + }, + { + "episode": 57856, + "epoch": 0.24784098697738177, + "eps": 6, + "loss/policy_avg": 0.13477346301078796, + "lr": 1.6875e-06, + "objective/entropy": -35.38134002685547, + "objective/kl": 35.48783874511719, + "objective/non_score_reward": -0.17743918299674988, + "objective/rlhf_reward": 10.00422191619873, + "objective/scores": 10.181660652160645, + "policy/approxkl_avg": 0.21763411164283752, + "policy/clipfrac_avg": 0.37109375, + "policy/entropy_avg": 0.27607789635658264, + "step": 112, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1811, + "val/ratio": 1.0001035928726196, + "val/ratio_var": 3.5984410260425648e-06 + }, + { + "episode": 58368, + "epoch": 0.25003427004797807, + "eps": 6, + "loss/policy_avg": 0.16193845868110657, + "lr": 1.67578125e-06, + "objective/entropy": -35.136634826660156, + "objective/kl": 35.53947830200195, + "objective/non_score_reward": -0.17769737541675568, + "objective/rlhf_reward": 10.041696548461914, + "objective/scores": 10.219392776489258, + "policy/approxkl_avg": 0.18851657211780548, + "policy/clipfrac_avg": 0.322265625, + "policy/entropy_avg": 0.2754863500595093, + "step": 113, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1822, + "val/ratio": 0.9998536109924316, + "val/ratio_var": 2.3457650968339294e-06 + }, + { + "episode": 58880, + "epoch": 0.25222755311857437, + "eps": 6, + "loss/policy_avg": 0.2662353217601776, + "lr": 1.6640625e-06, + "objective/entropy": -36.19813537597656, + "objective/kl": 35.548973083496094, + "objective/non_score_reward": -0.17774485051631927, + "objective/rlhf_reward": 9.807806015014648, + "objective/scores": 9.985549926757812, + "policy/approxkl_avg": 0.22413387894630432, + "policy/clipfrac_avg": 0.318359375, + "policy/entropy_avg": 0.28221410512924194, + "step": 114, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1945, + "val/ratio": 1.0000545978546143, + "val/ratio_var": 3.614477236624225e-06 + }, + { + "episode": 59392, + "epoch": 0.25442083618917066, + "eps": 6, + "loss/policy_avg": 0.1269383728504181, + "lr": 1.6523437500000001e-06, + "objective/entropy": -37.98196792602539, + "objective/kl": 34.821964263916016, + "objective/non_score_reward": -0.1741098165512085, + "objective/rlhf_reward": 9.85723876953125, + "objective/scores": 10.031347274780273, + "policy/approxkl_avg": 0.21144835650920868, + "policy/clipfrac_avg": 0.31640625, + "policy/entropy_avg": 0.27860069274902344, + "step": 115, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1918, + "val/ratio": 1.0000715255737305, + "val/ratio_var": 4.044930847157957e-06 + }, + { + "episode": 59904, + "epoch": 0.25661411925976696, + "eps": 6, + "loss/policy_avg": 0.18478526175022125, + "lr": 1.640625e-06, + "objective/entropy": -39.03168869018555, + "objective/kl": 34.09989929199219, + "objective/non_score_reward": -0.1704995036125183, + "objective/rlhf_reward": 10.072548866271973, + "objective/scores": 10.243047714233398, + "policy/approxkl_avg": 0.18763652443885803, + "policy/clipfrac_avg": 0.32421875, + "policy/entropy_avg": 0.276102751493454, + "step": 116, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1964, + "val/ratio": 1.0003482103347778, + "val/ratio_var": 3.092050519626355e-06 + }, + { + "episode": 60416, + "epoch": 0.25880740233036326, + "eps": 6, + "loss/policy_avg": 0.2445678561925888, + "lr": 1.62890625e-06, + "objective/entropy": -37.85968017578125, + "objective/kl": 34.59711456298828, + "objective/non_score_reward": -0.17298556864261627, + "objective/rlhf_reward": 9.838810920715332, + "objective/scores": 10.011796951293945, + "policy/approxkl_avg": 0.2142867147922516, + "policy/clipfrac_avg": 0.287109375, + "policy/entropy_avg": 0.2769862413406372, + "step": 117, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1913, + "val/ratio": 1.0000524520874023, + "val/ratio_var": 2.518341943869018e-06 + }, + { + "episode": 60928, + "epoch": 0.26100068540095955, + "eps": 6, + "loss/policy_avg": 0.14503635466098785, + "lr": 1.6171875000000001e-06, + "objective/entropy": -42.66570281982422, + "objective/kl": 33.559688568115234, + "objective/non_score_reward": -0.1677984595298767, + "objective/rlhf_reward": 10.2135591506958, + "objective/scores": 10.381357192993164, + "policy/approxkl_avg": 0.17557722330093384, + "policy/clipfrac_avg": 0.267578125, + "policy/entropy_avg": 0.27216678857803345, + "step": 118, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1978, + "val/ratio": 0.9997793436050415, + "val/ratio_var": 2.1884602574573364e-06 + }, + { + "episode": 61440, + "epoch": 0.26319396847155585, + "eps": 6, + "loss/policy_avg": 0.12860137224197388, + "lr": 1.60546875e-06, + "objective/entropy": -40.882850646972656, + "objective/kl": 35.01697540283203, + "objective/non_score_reward": -0.17508485913276672, + "objective/rlhf_reward": 9.812532424926758, + "objective/scores": 9.987617492675781, + "policy/approxkl_avg": 0.21601755917072296, + "policy/clipfrac_avg": 0.333984375, + "policy/entropy_avg": 0.2853110432624817, + "step": 119, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1991, + "val/ratio": 1.0000758171081543, + "val/ratio_var": 3.1630045214114944e-06 + }, + { + "episode": 61952, + "epoch": 0.26538725154215215, + "eps": 6, + "loss/policy_avg": 0.13918112218379974, + "lr": 1.59375e-06, + "objective/entropy": -42.673091888427734, + "objective/kl": 33.7802734375, + "objective/non_score_reward": -0.16890135407447815, + "objective/rlhf_reward": 9.991598129272461, + "objective/scores": 10.160500526428223, + "policy/approxkl_avg": 0.2096889615058899, + "policy/clipfrac_avg": 0.3125, + "policy/entropy_avg": 0.2698075771331787, + "step": 120, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1962, + "val/ratio": 0.9996597766876221, + "val/ratio_var": 2.5640338208177127e-06 + }, + { + "episode": 62464, + "epoch": 0.26758053461274844, + "eps": 6, + "loss/policy_avg": 0.14262011647224426, + "lr": 1.5820312500000001e-06, + "objective/entropy": -40.981964111328125, + "objective/kl": 33.93827819824219, + "objective/non_score_reward": -0.16969136893749237, + "objective/rlhf_reward": 9.967924118041992, + "objective/scores": 10.137615203857422, + "policy/approxkl_avg": 0.17477768659591675, + "policy/clipfrac_avg": 0.326171875, + "policy/entropy_avg": 0.27177703380584717, + "step": 121, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1948, + "val/ratio": 1.0003581047058105, + "val/ratio_var": 2.333908696527942e-06 + }, + { + "episode": 62976, + "epoch": 0.26977381768334474, + "eps": 6, + "loss/policy_avg": 0.1601879894733429, + "lr": 1.5703125e-06, + "objective/entropy": -39.27589797973633, + "objective/kl": 34.60676574707031, + "objective/non_score_reward": -0.17303383350372314, + "objective/rlhf_reward": 9.912843704223633, + "objective/scores": 10.085877418518066, + "policy/approxkl_avg": 0.20070144534111023, + "policy/clipfrac_avg": 0.337890625, + "policy/entropy_avg": 0.2859034836292267, + "step": 122, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1959, + "val/ratio": 0.9996767044067383, + "val/ratio_var": 2.508757233954384e-06 + }, + { + "episode": 63488, + "epoch": 0.27196710075394104, + "eps": 6, + "loss/policy_avg": 0.15391629934310913, + "lr": 1.55859375e-06, + "objective/entropy": -38.832088470458984, + "objective/kl": 35.0777473449707, + "objective/non_score_reward": -0.17538872361183167, + "objective/rlhf_reward": 9.996062278747559, + "objective/scores": 10.1714506149292, + "policy/approxkl_avg": 0.22946523129940033, + "policy/clipfrac_avg": 0.326171875, + "policy/entropy_avg": 0.27930349111557007, + "step": 123, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1959, + "val/ratio": 0.9997979402542114, + "val/ratio_var": 3.5869422845280496e-06 + }, + { + "episode": 64000, + "epoch": 0.27416038382453733, + "eps": 6, + "loss/policy_avg": 0.16380949318408966, + "lr": 1.5468750000000001e-06, + "objective/entropy": -39.75455856323242, + "objective/kl": 35.5320930480957, + "objective/non_score_reward": -0.17766046524047852, + "objective/rlhf_reward": 9.862564086914062, + "objective/scores": 10.040224075317383, + "policy/approxkl_avg": 0.1650054007768631, + "policy/clipfrac_avg": 0.318359375, + "policy/entropy_avg": 0.28795722126960754, + "step": 124, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1916, + "val/ratio": 0.9999822378158569, + "val/ratio_var": 2.5005276711453917e-06 + }, + { + "episode": 64512, + "epoch": 0.27635366689513363, + "eps": 6, + "loss/policy_avg": 0.14548206329345703, + "lr": 1.53515625e-06, + "objective/entropy": -40.37278747558594, + "objective/kl": 34.826271057128906, + "objective/non_score_reward": -0.1741313487291336, + "objective/rlhf_reward": 10.072320938110352, + "objective/scores": 10.246453285217285, + "policy/approxkl_avg": 0.1895635575056076, + "policy/clipfrac_avg": 0.328125, + "policy/entropy_avg": 0.28454479575157166, + "step": 125, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1992, + "val/ratio": 0.999925971031189, + "val/ratio_var": 2.291703822265845e-06 + }, + { + "episode": 65024, + "epoch": 0.27854694996572993, + "eps": 6, + "loss/policy_avg": 0.17899231612682343, + "lr": 1.5234375e-06, + "objective/entropy": -39.21260070800781, + "objective/kl": 36.2283935546875, + "objective/non_score_reward": -0.18114197254180908, + "objective/rlhf_reward": 9.876224517822266, + "objective/scores": 10.057367324829102, + "policy/approxkl_avg": 0.17429962754249573, + "policy/clipfrac_avg": 0.328125, + "policy/entropy_avg": 0.27740728855133057, + "step": 126, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1933, + "val/ratio": 1.000385046005249, + "val/ratio_var": 2.276981604154571e-06 + }, + { + "episode": 65536, + "epoch": 0.2807402330363262, + "eps": 6, + "loss/policy_avg": 0.2138238549232483, + "lr": 1.5117187500000001e-06, + "objective/entropy": -40.66987991333008, + "objective/kl": 34.04345703125, + "objective/non_score_reward": -0.17021729052066803, + "objective/rlhf_reward": 10.10722541809082, + "objective/scores": 10.277442932128906, + "policy/approxkl_avg": 0.184827983379364, + "policy/clipfrac_avg": 0.306640625, + "policy/entropy_avg": 0.2644522190093994, + "step": 127, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1960, + "val/ratio": 0.9999203681945801, + "val/ratio_var": 2.535048452045885e-06 + }, + { + "episode": 66048, + "epoch": 0.2829335161069225, + "eps": 6, + "loss/policy_avg": 0.17386293411254883, + "lr": 1.5e-06, + "objective/entropy": -38.950401306152344, + "objective/kl": 36.11156463623047, + "objective/non_score_reward": -0.18055780231952667, + "objective/rlhf_reward": 9.913749694824219, + "objective/scores": 10.094307899475098, + "policy/approxkl_avg": 0.2340121865272522, + "policy/clipfrac_avg": 0.283203125, + "policy/entropy_avg": 0.26781517267227173, + "step": 128, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1912, + "val/ratio": 0.9998964667320251, + "val/ratio_var": 2.7165133360540494e-06 + }, + { + "episode": 66560, + "epoch": 0.2851267991775189, + "eps": 6, + "loss/policy_avg": 0.11376126110553741, + "lr": 1.48828125e-06, + "objective/entropy": -37.08405303955078, + "objective/kl": 36.694183349609375, + "objective/non_score_reward": -0.1834709197282791, + "objective/rlhf_reward": 9.986257553100586, + "objective/scores": 10.169729232788086, + "policy/approxkl_avg": 0.19061046838760376, + "policy/clipfrac_avg": 0.296875, + "policy/entropy_avg": 0.2567726969718933, + "step": 129, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1826, + "val/ratio": 0.9997936487197876, + "val/ratio_var": 3.173572395098745e-06 + }, + { + "episode": 67072, + "epoch": 0.28732008224811517, + "eps": 6, + "loss/policy_avg": 0.16814622282981873, + "lr": 1.4765625e-06, + "objective/entropy": -37.90453338623047, + "objective/kl": 36.63753128051758, + "objective/non_score_reward": -0.18318764865398407, + "objective/rlhf_reward": 9.85715103149414, + "objective/scores": 10.040338516235352, + "policy/approxkl_avg": 0.2051369547843933, + "policy/clipfrac_avg": 0.345703125, + "policy/entropy_avg": 0.2719249129295349, + "step": 130, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1896, + "val/ratio": 1.0000419616699219, + "val/ratio_var": 2.5849769826891134e-06 + }, + { + "episode": 67584, + "epoch": 0.28951336531871147, + "eps": 6, + "loss/policy_avg": 0.11204706132411957, + "lr": 1.46484375e-06, + "objective/entropy": -38.804443359375, + "objective/kl": 36.267181396484375, + "objective/non_score_reward": -0.181335911154747, + "objective/rlhf_reward": 10.217302322387695, + "objective/scores": 10.398637771606445, + "policy/approxkl_avg": 0.24441036581993103, + "policy/clipfrac_avg": 0.3125, + "policy/entropy_avg": 0.25722193717956543, + "step": 131, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1899, + "val/ratio": 0.9999180436134338, + "val/ratio_var": 3.2145369459612994e-06 + }, + { + "episode": 68096, + "epoch": 0.29170664838930777, + "eps": 6, + "loss/policy_avg": 0.1794815957546234, + "lr": 1.453125e-06, + "objective/entropy": -35.273468017578125, + "objective/kl": 37.08363342285156, + "objective/non_score_reward": -0.18541815876960754, + "objective/rlhf_reward": 9.986465454101562, + "objective/scores": 10.171884536743164, + "policy/approxkl_avg": 0.24921618402004242, + "policy/clipfrac_avg": 0.30859375, + "policy/entropy_avg": 0.2578710913658142, + "step": 132, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1833, + "val/ratio": 0.9998206496238708, + "val/ratio_var": 3.307860424683895e-06 + }, + { + "episode": 68608, + "epoch": 0.29389993145990406, + "eps": 6, + "loss/policy_avg": 0.16161808371543884, + "lr": 1.44140625e-06, + "objective/entropy": -35.44409942626953, + "objective/kl": 37.412166595458984, + "objective/non_score_reward": -0.18706083297729492, + "objective/rlhf_reward": 9.986528396606445, + "objective/scores": 10.173589706420898, + "policy/approxkl_avg": 0.22423642873764038, + "policy/clipfrac_avg": 0.32421875, + "policy/entropy_avg": 0.2606107294559479, + "step": 133, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1926, + "val/ratio": 1.0000818967819214, + "val/ratio_var": 3.4354557101323735e-06 + }, + { + "episode": 69120, + "epoch": 0.29609321453050036, + "eps": 6, + "loss/policy_avg": 0.19688278436660767, + "lr": 1.4296875e-06, + "objective/entropy": -35.44132614135742, + "objective/kl": 38.76708984375, + "objective/non_score_reward": -0.19383545219898224, + "objective/rlhf_reward": 10.073609352111816, + "objective/scores": 10.26744556427002, + "policy/approxkl_avg": 0.22920569777488708, + "policy/clipfrac_avg": 0.333984375, + "policy/entropy_avg": 0.2607892155647278, + "step": 134, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1839, + "val/ratio": 1.0001198053359985, + "val/ratio_var": 3.989252945757471e-06 + }, + { + "episode": 69632, + "epoch": 0.29828649760109666, + "eps": 6, + "loss/policy_avg": 0.15440833568572998, + "lr": 1.41796875e-06, + "objective/entropy": -34.1768798828125, + "objective/kl": 37.674827575683594, + "objective/non_score_reward": -0.1883741319179535, + "objective/rlhf_reward": 10.175031661987305, + "objective/scores": 10.363405227661133, + "policy/approxkl_avg": 0.21583065390586853, + "policy/clipfrac_avg": 0.32421875, + "policy/entropy_avg": 0.2518673539161682, + "step": 135, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1858, + "val/ratio": 0.9998531937599182, + "val/ratio_var": 1.8523942344472744e-06 + }, + { + "episode": 70144, + "epoch": 0.30047978067169295, + "eps": 6, + "loss/policy_avg": 0.11003963649272919, + "lr": 1.40625e-06, + "objective/entropy": -37.877197265625, + "objective/kl": 37.75031280517578, + "objective/non_score_reward": -0.18875157833099365, + "objective/rlhf_reward": 10.152512550354004, + "objective/scores": 10.341262817382812, + "policy/approxkl_avg": 0.20760369300842285, + "policy/clipfrac_avg": 0.34375, + "policy/entropy_avg": 0.2564089000225067, + "step": 136, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1851, + "val/ratio": 0.9998672008514404, + "val/ratio_var": 2.3049710762279574e-06 + }, + { + "episode": 70656, + "epoch": 0.30267306374228925, + "eps": 6, + "loss/policy_avg": 0.14345203340053558, + "lr": 1.39453125e-06, + "objective/entropy": -35.715049743652344, + "objective/kl": 37.490478515625, + "objective/non_score_reward": -0.18745239078998566, + "objective/rlhf_reward": 10.060314178466797, + "objective/scores": 10.247767448425293, + "policy/approxkl_avg": 0.1824231892824173, + "policy/clipfrac_avg": 0.341796875, + "policy/entropy_avg": 0.2563643455505371, + "step": 137, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1824, + "val/ratio": 0.9999721050262451, + "val/ratio_var": 2.2789722606830765e-06 + }, + { + "episode": 71168, + "epoch": 0.30486634681288555, + "eps": 6, + "loss/policy_avg": 0.19940869510173798, + "lr": 1.3828125e-06, + "objective/entropy": -36.189083099365234, + "objective/kl": 38.07117462158203, + "objective/non_score_reward": -0.19035586714744568, + "objective/rlhf_reward": 10.252054214477539, + "objective/scores": 10.442410469055176, + "policy/approxkl_avg": 0.29191023111343384, + "policy/clipfrac_avg": 0.33984375, + "policy/entropy_avg": 0.2528005838394165, + "step": 138, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1870, + "val/ratio": 1.0000394582748413, + "val/ratio_var": 2.7595958727033576e-06 + }, + { + "episode": 71680, + "epoch": 0.30705962988348184, + "eps": 6, + "loss/policy_avg": 0.231558695435524, + "lr": 1.37109375e-06, + "objective/entropy": -35.53020477294922, + "objective/kl": 38.774810791015625, + "objective/non_score_reward": -0.1938740611076355, + "objective/rlhf_reward": 10.077913284301758, + "objective/scores": 10.271787643432617, + "policy/approxkl_avg": 0.243779718875885, + "policy/clipfrac_avg": 0.328125, + "policy/entropy_avg": 0.2702625095844269, + "step": 139, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1941, + "val/ratio": 0.9997032880783081, + "val/ratio_var": 3.4043978303088807e-06 + }, + { + "episode": 72192, + "epoch": 0.30925291295407814, + "eps": 6, + "loss/policy_avg": 0.18193811178207397, + "lr": 1.359375e-06, + "objective/entropy": -35.06866455078125, + "objective/kl": 38.534996032714844, + "objective/non_score_reward": -0.19267499446868896, + "objective/rlhf_reward": 10.014896392822266, + "objective/scores": 10.207571983337402, + "policy/approxkl_avg": 0.19139140844345093, + "policy/clipfrac_avg": 0.328125, + "policy/entropy_avg": 0.2817988693714142, + "step": 140, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1894, + "val/ratio": 1.000110149383545, + "val/ratio_var": 3.122817133771605e-06 + }, + { + "episode": 72704, + "epoch": 0.31144619602467444, + "eps": 6, + "loss/policy_avg": 0.11852683871984482, + "lr": 1.34765625e-06, + "objective/entropy": -34.31420135498047, + "objective/kl": 38.61653137207031, + "objective/non_score_reward": -0.19308267533779144, + "objective/rlhf_reward": 10.067350387573242, + "objective/scores": 10.260433197021484, + "policy/approxkl_avg": 0.1943662464618683, + "policy/clipfrac_avg": 0.337890625, + "policy/entropy_avg": 0.2707614600658417, + "step": 141, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1918, + "val/ratio": 0.9998263716697693, + "val/ratio_var": 1.7580903204361675e-06 + }, + { + "episode": 73216, + "epoch": 0.31363947909527073, + "eps": 6, + "loss/policy_avg": 0.14486292004585266, + "lr": 1.3359375e-06, + "objective/entropy": -39.37120056152344, + "objective/kl": 38.30462646484375, + "objective/non_score_reward": -0.19152314960956573, + "objective/rlhf_reward": 10.148775100708008, + "objective/scores": 10.34029769897461, + "policy/approxkl_avg": 0.1603834629058838, + "policy/clipfrac_avg": 0.30078125, + "policy/entropy_avg": 0.27501171827316284, + "step": 142, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2050, + "val/ratio": 1.0004277229309082, + "val/ratio_var": 3.1276979370886693e-06 + }, + { + "episode": 73728, + "epoch": 0.31583276216586703, + "eps": 6, + "loss/policy_avg": 0.16047213971614838, + "lr": 1.32421875e-06, + "objective/entropy": -37.26971435546875, + "objective/kl": 38.991920471191406, + "objective/non_score_reward": -0.1949596107006073, + "objective/rlhf_reward": 10.02863883972168, + "objective/scores": 10.22359848022461, + "policy/approxkl_avg": 0.1671905815601349, + "policy/clipfrac_avg": 0.326171875, + "policy/entropy_avg": 0.2872776985168457, + "step": 143, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2020, + "val/ratio": 1.0000941753387451, + "val/ratio_var": 2.1057860521978e-06 + }, + { + "episode": 74240, + "epoch": 0.31802604523646333, + "eps": 6, + "loss/policy_avg": 0.1813797652721405, + "lr": 1.3125000000000001e-06, + "objective/entropy": -36.97285461425781, + "objective/kl": 37.837974548339844, + "objective/non_score_reward": -0.1891898661851883, + "objective/rlhf_reward": 10.044130325317383, + "objective/scores": 10.233320236206055, + "policy/approxkl_avg": 0.2248232215642929, + "policy/clipfrac_avg": 0.310546875, + "policy/entropy_avg": 0.2798747420310974, + "step": 144, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2068, + "val/ratio": 0.9998457431793213, + "val/ratio_var": 2.881835371226771e-06 + }, + { + "episode": 74752, + "epoch": 0.3202193283070596, + "eps": 6, + "loss/policy_avg": 0.13438820838928223, + "lr": 1.30078125e-06, + "objective/entropy": -36.45724105834961, + "objective/kl": 38.754398345947266, + "objective/non_score_reward": -0.19377200305461884, + "objective/rlhf_reward": 9.884381294250488, + "objective/scores": 10.078152656555176, + "policy/approxkl_avg": 0.2546514570713043, + "policy/clipfrac_avg": 0.35546875, + "policy/entropy_avg": 0.2870992422103882, + "step": 145, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2050, + "val/ratio": 1.0002031326293945, + "val/ratio_var": 3.175674692101893e-06 + }, + { + "episode": 75264, + "epoch": 0.3224126113776559, + "eps": 6, + "loss/policy_avg": 0.14070110023021698, + "lr": 1.2890625e-06, + "objective/entropy": -38.1695671081543, + "objective/kl": 37.88903045654297, + "objective/non_score_reward": -0.1894451379776001, + "objective/rlhf_reward": 10.021247863769531, + "objective/scores": 10.210692405700684, + "policy/approxkl_avg": 0.29337236285209656, + "policy/clipfrac_avg": 0.33203125, + "policy/entropy_avg": 0.28478890657424927, + "step": 146, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2011, + "val/ratio": 0.9997515678405762, + "val/ratio_var": 2.286319158883998e-06 + }, + { + "episode": 75776, + "epoch": 0.3246058944482522, + "eps": 6, + "loss/policy_avg": 0.195893794298172, + "lr": 1.2773437500000001e-06, + "objective/entropy": -38.522796630859375, + "objective/kl": 37.41321563720703, + "objective/non_score_reward": -0.18706606328487396, + "objective/rlhf_reward": 10.484095573425293, + "objective/scores": 10.671161651611328, + "policy/approxkl_avg": 0.2040579468011856, + "policy/clipfrac_avg": 0.32421875, + "policy/entropy_avg": 0.2731180787086487, + "step": 147, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2030, + "val/ratio": 1.0002871751785278, + "val/ratio_var": 3.957848093705252e-06 + }, + { + "episode": 76288, + "epoch": 0.3267991775188485, + "eps": 6, + "loss/policy_avg": 0.26951611042022705, + "lr": 1.265625e-06, + "objective/entropy": -39.15596008300781, + "objective/kl": 37.68677520751953, + "objective/non_score_reward": -0.18843387067317963, + "objective/rlhf_reward": 9.888592720031738, + "objective/scores": 10.0770263671875, + "policy/approxkl_avg": 0.22888456284999847, + "policy/clipfrac_avg": 0.341796875, + "policy/entropy_avg": 0.290910005569458, + "step": 148, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2002, + "val/ratio": 0.9998010993003845, + "val/ratio_var": 3.807401981248404e-06 + }, + { + "episode": 76800, + "epoch": 0.3289924605894448, + "eps": 6, + "loss/policy_avg": 0.13903886079788208, + "lr": 1.25390625e-06, + "objective/entropy": -41.032344818115234, + "objective/kl": 37.27531051635742, + "objective/non_score_reward": -0.18637654185295105, + "objective/rlhf_reward": 10.225309371948242, + "objective/scores": 10.411685943603516, + "policy/approxkl_avg": 0.2155190408229828, + "policy/clipfrac_avg": 0.326171875, + "policy/entropy_avg": 0.28945156931877136, + "step": 149, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2079, + "val/ratio": 0.9995552897453308, + "val/ratio_var": 2.633650410643895e-06 + }, + { + "episode": 77312, + "epoch": 0.3311857436600411, + "eps": 6, + "loss/policy_avg": 0.1273675560951233, + "lr": 1.2421875000000001e-06, + "objective/entropy": -39.33563232421875, + "objective/kl": 37.13164138793945, + "objective/non_score_reward": -0.18565818667411804, + "objective/rlhf_reward": 10.037923812866211, + "objective/scores": 10.22358226776123, + "policy/approxkl_avg": 0.190812885761261, + "policy/clipfrac_avg": 0.322265625, + "policy/entropy_avg": 0.301871120929718, + "step": 150, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2109, + "val/ratio": 1.000533938407898, + "val/ratio_var": 2.3769293875375297e-06 + }, + { + "episode": 77824, + "epoch": 0.3333790267306374, + "eps": 6, + "loss/policy_avg": 0.1281273365020752, + "lr": 1.23046875e-06, + "objective/entropy": -42.95703125, + "objective/kl": 36.36384963989258, + "objective/non_score_reward": -0.18181924521923065, + "objective/rlhf_reward": 10.057170867919922, + "objective/scores": 10.238990783691406, + "policy/approxkl_avg": 0.2149166464805603, + "policy/clipfrac_avg": 0.29296875, + "policy/entropy_avg": 0.29416555166244507, + "step": 151, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2151, + "val/ratio": 0.9998396635055542, + "val/ratio_var": 2.393756176388706e-06 + }, + { + "episode": 78336, + "epoch": 0.3355723098012337, + "eps": 6, + "loss/policy_avg": 0.14670319855213165, + "lr": 1.21875e-06, + "objective/entropy": -39.75312805175781, + "objective/kl": 38.453853607177734, + "objective/non_score_reward": -0.19226926565170288, + "objective/rlhf_reward": 9.968439102172852, + "objective/scores": 10.160709381103516, + "policy/approxkl_avg": 0.2506256699562073, + "policy/clipfrac_avg": 0.34765625, + "policy/entropy_avg": 0.31780898571014404, + "step": 152, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2134, + "val/ratio": 0.999654233455658, + "val/ratio_var": 2.561226438047015e-06 + }, + { + "episode": 78848, + "epoch": 0.33776559287183, + "eps": 6, + "loss/policy_avg": 0.2252342402935028, + "lr": 1.2070312500000001e-06, + "objective/entropy": -42.7922248840332, + "objective/kl": 36.196571350097656, + "objective/non_score_reward": -0.18098285794258118, + "objective/rlhf_reward": 10.031404495239258, + "objective/scores": 10.212387084960938, + "policy/approxkl_avg": 0.20541194081306458, + "policy/clipfrac_avg": 0.361328125, + "policy/entropy_avg": 0.2943829596042633, + "step": 153, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2158, + "val/ratio": 1.0003933906555176, + "val/ratio_var": 3.811472652159864e-06 + }, + { + "episode": 79360, + "epoch": 0.3399588759424263, + "eps": 6, + "loss/policy_avg": 0.15521711111068726, + "lr": 1.1953125e-06, + "objective/entropy": -42.242713928222656, + "objective/kl": 36.14505386352539, + "objective/non_score_reward": -0.18072527647018433, + "objective/rlhf_reward": 9.896879196166992, + "objective/scores": 10.077604293823242, + "policy/approxkl_avg": 0.22271820902824402, + "policy/clipfrac_avg": 0.33984375, + "policy/entropy_avg": 0.3020790219306946, + "step": 154, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2169, + "val/ratio": 1.000067949295044, + "val/ratio_var": 4.0633130993228406e-06 + }, + { + "episode": 79872, + "epoch": 0.3421521590130226, + "eps": 6, + "loss/policy_avg": 0.1543515920639038, + "lr": 1.18359375e-06, + "objective/entropy": -45.61183166503906, + "objective/kl": 34.73932647705078, + "objective/non_score_reward": -0.1736966222524643, + "objective/rlhf_reward": 10.028919219970703, + "objective/scores": 10.202615737915039, + "policy/approxkl_avg": 0.20797452330589294, + "policy/clipfrac_avg": 0.314453125, + "policy/entropy_avg": 0.31615930795669556, + "step": 155, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2160, + "val/ratio": 1.0000807046890259, + "val/ratio_var": 2.4551095521019306e-06 + }, + { + "episode": 80384, + "epoch": 0.3443454420836189, + "eps": 6, + "loss/policy_avg": 0.21340657770633698, + "lr": 1.1718750000000001e-06, + "objective/entropy": -44.281517028808594, + "objective/kl": 36.93424987792969, + "objective/non_score_reward": -0.18467125296592712, + "objective/rlhf_reward": 10.170047760009766, + "objective/scores": 10.354719161987305, + "policy/approxkl_avg": 0.25432729721069336, + "policy/clipfrac_avg": 0.3359375, + "policy/entropy_avg": 0.3165222704410553, + "step": 156, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2221, + "val/ratio": 0.9997285604476929, + "val/ratio_var": 2.971639560200856e-06 + }, + { + "episode": 80896, + "epoch": 0.34653872515421524, + "eps": 6, + "loss/policy_avg": 0.21420806646347046, + "lr": 1.16015625e-06, + "objective/entropy": -43.32524108886719, + "objective/kl": 35.91206359863281, + "objective/non_score_reward": -0.1795603185892105, + "objective/rlhf_reward": 9.86389446258545, + "objective/scores": 10.043455123901367, + "policy/approxkl_avg": 0.20726487040519714, + "policy/clipfrac_avg": 0.34375, + "policy/entropy_avg": 0.30649200081825256, + "step": 157, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2188, + "val/ratio": 1.0001165866851807, + "val/ratio_var": 2.7519311061041662e-06 + }, + { + "episode": 81408, + "epoch": 0.34873200822481154, + "eps": 6, + "loss/policy_avg": 0.17478860914707184, + "lr": 1.1484375e-06, + "objective/entropy": -45.10813903808594, + "objective/kl": 34.43164825439453, + "objective/non_score_reward": -0.17215824127197266, + "objective/rlhf_reward": 10.142086029052734, + "objective/scores": 10.314245223999023, + "policy/approxkl_avg": 0.21183699369430542, + "policy/clipfrac_avg": 0.337890625, + "policy/entropy_avg": 0.30924201011657715, + "step": 158, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2156, + "val/ratio": 1.0000635385513306, + "val/ratio_var": 3.148848918499425e-06 + }, + { + "episode": 81920, + "epoch": 0.35092529129540784, + "eps": 6, + "loss/policy_avg": 0.19227883219718933, + "lr": 1.13671875e-06, + "objective/entropy": -45.87828826904297, + "objective/kl": 34.158355712890625, + "objective/non_score_reward": -0.17079177498817444, + "objective/rlhf_reward": 10.004722595214844, + "objective/scores": 10.175514221191406, + "policy/approxkl_avg": 0.1948489397764206, + "policy/clipfrac_avg": 0.349609375, + "policy/entropy_avg": 0.3044930398464203, + "step": 159, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2136, + "val/ratio": 1.000248908996582, + "val/ratio_var": 2.8109091090300353e-06 + }, + { + "episode": 82432, + "epoch": 0.35311857436600413, + "eps": 6, + "loss/policy_avg": 0.10886512696743011, + "lr": 1.125e-06, + "objective/entropy": -48.61842346191406, + "objective/kl": 34.4404296875, + "objective/non_score_reward": -0.17220212519168854, + "objective/rlhf_reward": 9.696288108825684, + "objective/scores": 9.868490219116211, + "policy/approxkl_avg": 0.17349249124526978, + "policy/clipfrac_avg": 0.328125, + "policy/entropy_avg": 0.3052448034286499, + "step": 160, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2224, + "val/ratio": 0.9999510049819946, + "val/ratio_var": 1.918686166391126e-06 + }, + { + "episode": 82944, + "epoch": 0.35531185743660043, + "eps": 6, + "loss/policy_avg": 0.11456018686294556, + "lr": 1.11328125e-06, + "objective/entropy": -45.592594146728516, + "objective/kl": 34.33940124511719, + "objective/non_score_reward": -0.1716970056295395, + "objective/rlhf_reward": 9.983051300048828, + "objective/scores": 10.154748916625977, + "policy/approxkl_avg": 0.23430576920509338, + "policy/clipfrac_avg": 0.353515625, + "policy/entropy_avg": 0.3039991557598114, + "step": 161, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2222, + "val/ratio": 1.0001614093780518, + "val/ratio_var": 3.715218781508156e-06 + }, + { + "episode": 83456, + "epoch": 0.35750514050719673, + "eps": 6, + "loss/policy_avg": 0.153703972697258, + "lr": 1.1015625e-06, + "objective/entropy": -44.5072021484375, + "objective/kl": 34.8854866027832, + "objective/non_score_reward": -0.17442743480205536, + "objective/rlhf_reward": 10.038424491882324, + "objective/scores": 10.212852478027344, + "policy/approxkl_avg": 0.22196213901042938, + "policy/clipfrac_avg": 0.32421875, + "policy/entropy_avg": 0.2997274398803711, + "step": 162, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2142, + "val/ratio": 0.9998916983604431, + "val/ratio_var": 3.932265371986432e-06 + }, + { + "episode": 83968, + "epoch": 0.359698423577793, + "eps": 6, + "loss/policy_avg": 0.07879342138767242, + "lr": 1.08984375e-06, + "objective/entropy": -44.19832229614258, + "objective/kl": 35.476566314697266, + "objective/non_score_reward": -0.17738284170627594, + "objective/rlhf_reward": 10.00450611114502, + "objective/scores": 10.181888580322266, + "policy/approxkl_avg": 0.20819735527038574, + "policy/clipfrac_avg": 0.326171875, + "policy/entropy_avg": 0.30624184012413025, + "step": 163, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2152, + "val/ratio": 0.9998291730880737, + "val/ratio_var": 3.605615347623825e-06 + }, + { + "episode": 84480, + "epoch": 0.3618917066483893, + "eps": 6, + "loss/policy_avg": 0.0724404901266098, + "lr": 1.078125e-06, + "objective/entropy": -47.138023376464844, + "objective/kl": 34.859161376953125, + "objective/non_score_reward": -0.1742957979440689, + "objective/rlhf_reward": 9.974884033203125, + "objective/scores": 10.149181365966797, + "policy/approxkl_avg": 0.23386578261852264, + "policy/clipfrac_avg": 0.353515625, + "policy/entropy_avg": 0.30695077776908875, + "step": 164, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2135, + "val/ratio": 1.0000650882720947, + "val/ratio_var": 2.6056186470668763e-06 + }, + { + "episode": 84992, + "epoch": 0.3640849897189856, + "eps": 6, + "loss/policy_avg": 0.1969626247882843, + "lr": 1.06640625e-06, + "objective/entropy": -45.13905334472656, + "objective/kl": 34.560428619384766, + "objective/non_score_reward": -0.1728021502494812, + "objective/rlhf_reward": 10.120126724243164, + "objective/scores": 10.292928695678711, + "policy/approxkl_avg": 0.19709500670433044, + "policy/clipfrac_avg": 0.349609375, + "policy/entropy_avg": 0.2968193292617798, + "step": 165, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2101, + "val/ratio": 0.9999551773071289, + "val/ratio_var": 2.9901325433456805e-06 + }, + { + "episode": 85504, + "epoch": 0.3662782727895819, + "eps": 6, + "loss/policy_avg": 0.1212851032614708, + "lr": 1.0546875e-06, + "objective/entropy": -44.7745361328125, + "objective/kl": 36.573219299316406, + "objective/non_score_reward": -0.18286609649658203, + "objective/rlhf_reward": 10.04844856262207, + "objective/scores": 10.231313705444336, + "policy/approxkl_avg": 0.20575228333473206, + "policy/clipfrac_avg": 0.3359375, + "policy/entropy_avg": 0.3033609390258789, + "step": 166, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2073, + "val/ratio": 0.9998507499694824, + "val/ratio_var": 3.099663899774896e-06 + }, + { + "episode": 86016, + "epoch": 0.3684715558601782, + "eps": 6, + "loss/policy_avg": 0.14179468154907227, + "lr": 1.04296875e-06, + "objective/entropy": -45.21974182128906, + "objective/kl": 34.958003997802734, + "objective/non_score_reward": -0.17479000985622406, + "objective/rlhf_reward": 10.091043472290039, + "objective/scores": 10.265833854675293, + "policy/approxkl_avg": 0.4719541668891907, + "policy/clipfrac_avg": 0.318359375, + "policy/entropy_avg": 0.30320775508880615, + "step": 167, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2072, + "val/ratio": 1.0002262592315674, + "val/ratio_var": 3.7852200875931885e-06 + }, + { + "episode": 86528, + "epoch": 0.3706648389307745, + "eps": 6, + "loss/policy_avg": 0.1506226807832718, + "lr": 1.03125e-06, + "objective/entropy": -46.42851257324219, + "objective/kl": 35.120418548583984, + "objective/non_score_reward": -0.17560207843780518, + "objective/rlhf_reward": 9.96408748626709, + "objective/scores": 10.139688491821289, + "policy/approxkl_avg": 0.20192615687847137, + "policy/clipfrac_avg": 0.3203125, + "policy/entropy_avg": 0.3063679337501526, + "step": 168, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2059, + "val/ratio": 0.9996170997619629, + "val/ratio_var": 2.1293217287166044e-06 + }, + { + "episode": 87040, + "epoch": 0.3728581220013708, + "eps": 6, + "loss/policy_avg": 0.1173999235033989, + "lr": 1.01953125e-06, + "objective/entropy": -45.61651611328125, + "objective/kl": 35.65013885498047, + "objective/non_score_reward": -0.17825068533420563, + "objective/rlhf_reward": 10.056770324707031, + "objective/scores": 10.235021591186523, + "policy/approxkl_avg": 0.2224082350730896, + "policy/clipfrac_avg": 0.337890625, + "policy/entropy_avg": 0.3040269613265991, + "step": 169, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2075, + "val/ratio": 0.9998025894165039, + "val/ratio_var": 2.9224604531918885e-06 + }, + { + "episode": 87552, + "epoch": 0.3750514050719671, + "eps": 6, + "loss/policy_avg": 0.15251114964485168, + "lr": 1.0078125e-06, + "objective/entropy": -42.0252799987793, + "objective/kl": 36.96675109863281, + "objective/non_score_reward": -0.18483373522758484, + "objective/rlhf_reward": 9.96993637084961, + "objective/scores": 10.154769897460938, + "policy/approxkl_avg": 0.3365061581134796, + "policy/clipfrac_avg": 0.34765625, + "policy/entropy_avg": 0.30356451869010925, + "step": 170, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2127, + "val/ratio": 1.0000369548797607, + "val/ratio_var": 3.884088982886169e-06 + }, + { + "episode": 88064, + "epoch": 0.3772446881425634, + "eps": 6, + "loss/policy_avg": 0.14639852941036224, + "lr": 9.9609375e-07, + "objective/entropy": -46.64868927001953, + "objective/kl": 35.58274841308594, + "objective/non_score_reward": -0.17791372537612915, + "objective/rlhf_reward": 9.998994827270508, + "objective/scores": 10.176908493041992, + "policy/approxkl_avg": 0.23724576830863953, + "policy/clipfrac_avg": 0.306640625, + "policy/entropy_avg": 0.3028953969478607, + "step": 171, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2067, + "val/ratio": 0.9992661476135254, + "val/ratio_var": 2.3232767034642166e-06 + }, + { + "episode": 88576, + "epoch": 0.3794379712131597, + "eps": 6, + "loss/policy_avg": 0.1436213254928589, + "lr": 9.84375e-07, + "objective/entropy": -41.64381408691406, + "objective/kl": 37.166568756103516, + "objective/non_score_reward": -0.18583282828330994, + "objective/rlhf_reward": 9.991978645324707, + "objective/scores": 10.177811622619629, + "policy/approxkl_avg": 0.2451467663049698, + "policy/clipfrac_avg": 0.3359375, + "policy/entropy_avg": 0.3070330321788788, + "step": 172, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2059, + "val/ratio": 1.0000195503234863, + "val/ratio_var": 2.694009708648082e-06 + }, + { + "episode": 89088, + "epoch": 0.381631254283756, + "eps": 6, + "loss/policy_avg": 0.1377355307340622, + "lr": 9.7265625e-07, + "objective/entropy": -44.69953918457031, + "objective/kl": 35.26990509033203, + "objective/non_score_reward": -0.17634952068328857, + "objective/rlhf_reward": 10.164739608764648, + "objective/scores": 10.34108829498291, + "policy/approxkl_avg": 0.18719518184661865, + "policy/clipfrac_avg": 0.345703125, + "policy/entropy_avg": 0.29084381461143494, + "step": 173, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1986, + "val/ratio": 1.0002484321594238, + "val/ratio_var": 4.782595624419628e-06 + }, + { + "episode": 89600, + "epoch": 0.3838245373543523, + "eps": 6, + "loss/policy_avg": 0.145149827003479, + "lr": 9.609375e-07, + "objective/entropy": -42.52074432373047, + "objective/kl": 36.28486633300781, + "objective/non_score_reward": -0.1814243197441101, + "objective/rlhf_reward": 10.075740814208984, + "objective/scores": 10.25716495513916, + "policy/approxkl_avg": 0.23402442038059235, + "policy/clipfrac_avg": 0.3203125, + "policy/entropy_avg": 0.29287636280059814, + "step": 174, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1973, + "val/ratio": 1.000025987625122, + "val/ratio_var": 4.361321771284565e-06 + }, + { + "episode": 90112, + "epoch": 0.3860178204249486, + "eps": 6, + "loss/policy_avg": 0.10543729364871979, + "lr": 9.4921875e-07, + "objective/entropy": -41.868194580078125, + "objective/kl": 37.273529052734375, + "objective/non_score_reward": -0.18636763095855713, + "objective/rlhf_reward": 10.175069808959961, + "objective/scores": 10.36143684387207, + "policy/approxkl_avg": 0.1950404942035675, + "policy/clipfrac_avg": 0.35546875, + "policy/entropy_avg": 0.30170726776123047, + "step": 175, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2002, + "val/ratio": 1.000225305557251, + "val/ratio_var": 2.6567659006104805e-06 + }, + { + "episode": 90624, + "epoch": 0.3882111034955449, + "eps": 6, + "loss/policy_avg": 0.17701852321624756, + "lr": 9.375e-07, + "objective/entropy": -41.182373046875, + "objective/kl": 36.40575408935547, + "objective/non_score_reward": -0.18202877044677734, + "objective/rlhf_reward": 10.043168067932129, + "objective/scores": 10.225196838378906, + "policy/approxkl_avg": 0.23861688375473022, + "policy/clipfrac_avg": 0.33203125, + "policy/entropy_avg": 0.29390662908554077, + "step": 176, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2005, + "val/ratio": 0.9996527433395386, + "val/ratio_var": 3.4563124700071057e-06 + }, + { + "episode": 91136, + "epoch": 0.3904043865661412, + "eps": 6, + "loss/policy_avg": 0.1414596140384674, + "lr": 9.257812500000001e-07, + "objective/entropy": -39.34553909301758, + "objective/kl": 36.56566619873047, + "objective/non_score_reward": -0.18282833695411682, + "objective/rlhf_reward": 10.158039093017578, + "objective/scores": 10.340867042541504, + "policy/approxkl_avg": 0.2117760330438614, + "policy/clipfrac_avg": 0.341796875, + "policy/entropy_avg": 0.3072349727153778, + "step": 177, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1934, + "val/ratio": 1.0001301765441895, + "val/ratio_var": 2.827512389558251e-06 + }, + { + "episode": 91648, + "epoch": 0.3925976696367375, + "eps": 6, + "loss/policy_avg": 0.1428884118795395, + "lr": 9.140625e-07, + "objective/entropy": -42.55487823486328, + "objective/kl": 37.40206527709961, + "objective/non_score_reward": -0.18701031804084778, + "objective/rlhf_reward": 10.196723937988281, + "objective/scores": 10.383734703063965, + "policy/approxkl_avg": 0.2159181535243988, + "policy/clipfrac_avg": 0.361328125, + "policy/entropy_avg": 0.30771517753601074, + "step": 178, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2047, + "val/ratio": 1.0000932216644287, + "val/ratio_var": 2.8553140509757213e-06 + }, + { + "episode": 92160, + "epoch": 0.3947909527073338, + "eps": 6, + "loss/policy_avg": 0.149674192070961, + "lr": 9.0234375e-07, + "objective/entropy": -40.15728759765625, + "objective/kl": 38.55815887451172, + "objective/non_score_reward": -0.19279080629348755, + "objective/rlhf_reward": 10.088329315185547, + "objective/scores": 10.281119346618652, + "policy/approxkl_avg": 0.27164649963378906, + "policy/clipfrac_avg": 0.388671875, + "policy/entropy_avg": 0.30614158511161804, + "step": 179, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1958, + "val/ratio": 0.999848484992981, + "val/ratio_var": 3.3823580452008173e-06 + }, + { + "episode": 92672, + "epoch": 0.3969842357779301, + "eps": 6, + "loss/policy_avg": 0.09967108070850372, + "lr": 8.90625e-07, + "objective/entropy": -40.094696044921875, + "objective/kl": 39.13507843017578, + "objective/non_score_reward": -0.19567537307739258, + "objective/rlhf_reward": 10.153437614440918, + "objective/scores": 10.349113464355469, + "policy/approxkl_avg": 0.23491740226745605, + "policy/clipfrac_avg": 0.345703125, + "policy/entropy_avg": 0.2897144556045532, + "step": 180, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1957, + "val/ratio": 0.999672532081604, + "val/ratio_var": 3.0424698707065545e-06 + }, + { + "episode": 93184, + "epoch": 0.39917751884852637, + "eps": 6, + "loss/policy_avg": 0.14683936536312103, + "lr": 8.7890625e-07, + "objective/entropy": -41.210853576660156, + "objective/kl": 37.40240478515625, + "objective/non_score_reward": -0.18701201677322388, + "objective/rlhf_reward": 10.140878677368164, + "objective/scores": 10.327892303466797, + "policy/approxkl_avg": 0.3033018708229065, + "policy/clipfrac_avg": 0.333984375, + "policy/entropy_avg": 0.2977150082588196, + "step": 181, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2024, + "val/ratio": 1.0001099109649658, + "val/ratio_var": 2.969815341202775e-06 + }, + { + "episode": 93696, + "epoch": 0.40137080191912267, + "eps": 6, + "loss/policy_avg": 0.15430837869644165, + "lr": 8.671875e-07, + "objective/entropy": -41.79518508911133, + "objective/kl": 37.84357833862305, + "objective/non_score_reward": -0.18921789526939392, + "objective/rlhf_reward": 10.359935760498047, + "objective/scores": 10.549153327941895, + "policy/approxkl_avg": 0.22429192066192627, + "policy/clipfrac_avg": 0.337890625, + "policy/entropy_avg": 0.29293376207351685, + "step": 182, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2064, + "val/ratio": 1.0002741813659668, + "val/ratio_var": 2.9459497454809025e-06 + }, + { + "episode": 94208, + "epoch": 0.40356408498971896, + "eps": 6, + "loss/policy_avg": 0.13481080532073975, + "lr": 8.5546875e-07, + "objective/entropy": -42.12849044799805, + "objective/kl": 37.44647216796875, + "objective/non_score_reward": -0.1872323751449585, + "objective/rlhf_reward": 9.978042602539062, + "objective/scores": 10.165275573730469, + "policy/approxkl_avg": 0.23636795580387115, + "policy/clipfrac_avg": 0.36328125, + "policy/entropy_avg": 0.30856287479400635, + "step": 183, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2047, + "val/ratio": 0.9998284578323364, + "val/ratio_var": 3.3832554890977917e-06 + }, + { + "episode": 94720, + "epoch": 0.40575736806031526, + "eps": 6, + "loss/policy_avg": 0.12567618489265442, + "lr": 8.4375e-07, + "objective/entropy": -41.77434539794922, + "objective/kl": 37.33880615234375, + "objective/non_score_reward": -0.18669402599334717, + "objective/rlhf_reward": 10.023723602294922, + "objective/scores": 10.210416793823242, + "policy/approxkl_avg": 0.24579544365406036, + "policy/clipfrac_avg": 0.3671875, + "policy/entropy_avg": 0.30206605792045593, + "step": 184, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2047, + "val/ratio": 1.0003873109817505, + "val/ratio_var": 3.945482148992596e-06 + }, + { + "episode": 95232, + "epoch": 0.40795065113091156, + "eps": 6, + "loss/policy_avg": 0.1462424248456955, + "lr": 8.3203125e-07, + "objective/entropy": -43.626190185546875, + "objective/kl": 37.824928283691406, + "objective/non_score_reward": -0.18912464380264282, + "objective/rlhf_reward": 9.99215316772461, + "objective/scores": 10.18127727508545, + "policy/approxkl_avg": 0.21663124859333038, + "policy/clipfrac_avg": 0.31640625, + "policy/entropy_avg": 0.28824448585510254, + "step": 185, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1988, + "val/ratio": 1.0000272989273071, + "val/ratio_var": 2.306805754415109e-06 + }, + { + "episode": 95744, + "epoch": 0.4101439342015079, + "eps": 6, + "loss/policy_avg": 0.12344589829444885, + "lr": 8.203125e-07, + "objective/entropy": -40.98870086669922, + "objective/kl": 36.16552734375, + "objective/non_score_reward": -0.18082764744758606, + "objective/rlhf_reward": 10.318306922912598, + "objective/scores": 10.499134063720703, + "policy/approxkl_avg": 0.22008797526359558, + "policy/clipfrac_avg": 0.322265625, + "policy/entropy_avg": 0.28174102306365967, + "step": 186, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2029, + "val/ratio": 0.9997909069061279, + "val/ratio_var": 2.35360653277894e-06 + }, + { + "episode": 96256, + "epoch": 0.4123372172721042, + "eps": 6, + "loss/policy_avg": 0.11904306709766388, + "lr": 8.085937500000001e-07, + "objective/entropy": -44.73994064331055, + "objective/kl": 36.407371520996094, + "objective/non_score_reward": -0.1820368468761444, + "objective/rlhf_reward": 9.902576446533203, + "objective/scores": 10.084613800048828, + "policy/approxkl_avg": 0.22743195295333862, + "policy/clipfrac_avg": 0.33203125, + "policy/entropy_avg": 0.29516759514808655, + "step": 187, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2098, + "val/ratio": 0.9997510313987732, + "val/ratio_var": 3.703849415614968e-06 + }, + { + "episode": 96768, + "epoch": 0.4145305003427005, + "eps": 6, + "loss/policy_avg": 0.21387794613838196, + "lr": 7.96875e-07, + "objective/entropy": -43.10389709472656, + "objective/kl": 36.83638381958008, + "objective/non_score_reward": -0.18418191373348236, + "objective/rlhf_reward": 10.138248443603516, + "objective/scores": 10.322430610656738, + "policy/approxkl_avg": 0.15915729105472565, + "policy/clipfrac_avg": 0.28515625, + "policy/entropy_avg": 0.291852205991745, + "step": 188, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2029, + "val/ratio": 0.9999695420265198, + "val/ratio_var": 2.5079311853914987e-06 + }, + { + "episode": 97280, + "epoch": 0.4167237834132968, + "eps": 6, + "loss/policy_avg": 0.19237488508224487, + "lr": 7.8515625e-07, + "objective/entropy": -41.943084716796875, + "objective/kl": 37.63665771484375, + "objective/non_score_reward": -0.1881832778453827, + "objective/rlhf_reward": 10.08050537109375, + "objective/scores": 10.268688201904297, + "policy/approxkl_avg": 0.297230988740921, + "policy/clipfrac_avg": 0.35546875, + "policy/entropy_avg": 0.27060505747795105, + "step": 189, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1981, + "val/ratio": 0.9996173977851868, + "val/ratio_var": 3.2602993087493815e-06 + }, + { + "episode": 97792, + "epoch": 0.4189170664838931, + "eps": 6, + "loss/policy_avg": 0.2517552375793457, + "lr": 7.734375000000001e-07, + "objective/entropy": -41.90442657470703, + "objective/kl": 37.97048568725586, + "objective/non_score_reward": -0.18985243141651154, + "objective/rlhf_reward": 10.145000457763672, + "objective/scores": 10.334854125976562, + "policy/approxkl_avg": 0.25615477561950684, + "policy/clipfrac_avg": 0.337890625, + "policy/entropy_avg": 0.2846136689186096, + "step": 190, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1973, + "val/ratio": 0.9998682737350464, + "val/ratio_var": 2.448537316013244e-06 + }, + { + "episode": 98304, + "epoch": 0.4211103495544894, + "eps": 6, + "loss/policy_avg": 0.14701473712921143, + "lr": 7.6171875e-07, + "objective/entropy": -41.90221405029297, + "objective/kl": 37.23102569580078, + "objective/non_score_reward": -0.18615514039993286, + "objective/rlhf_reward": 10.309503555297852, + "objective/scores": 10.495659828186035, + "policy/approxkl_avg": 0.1897934526205063, + "policy/clipfrac_avg": 0.310546875, + "policy/entropy_avg": 0.2846633195877075, + "step": 191, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1943, + "val/ratio": 0.9998244047164917, + "val/ratio_var": 3.4015531582554104e-06 + }, + { + "episode": 98816, + "epoch": 0.4233036326250857, + "eps": 6, + "loss/policy_avg": 0.1151188462972641, + "lr": 7.5e-07, + "objective/entropy": -42.558258056640625, + "objective/kl": 37.32960891723633, + "objective/non_score_reward": -0.18664804100990295, + "objective/rlhf_reward": 10.326423645019531, + "objective/scores": 10.513072967529297, + "policy/approxkl_avg": 0.20801270008087158, + "policy/clipfrac_avg": 0.349609375, + "policy/entropy_avg": 0.2727763056755066, + "step": 192, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2003, + "val/ratio": 0.9997249245643616, + "val/ratio_var": 2.510585090931272e-06 + }, + { + "episode": 99328, + "epoch": 0.425496915695682, + "eps": 6, + "loss/policy_avg": 0.15836259722709656, + "lr": 7.3828125e-07, + "objective/entropy": -41.13874816894531, + "objective/kl": 38.534767150878906, + "objective/non_score_reward": -0.19267383217811584, + "objective/rlhf_reward": 10.352947235107422, + "objective/scores": 10.545621871948242, + "policy/approxkl_avg": 0.18514463305473328, + "policy/clipfrac_avg": 0.32421875, + "policy/entropy_avg": 0.27844926714897156, + "step": 193, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2000, + "val/ratio": 1.0003242492675781, + "val/ratio_var": 2.664654402906308e-06 + }, + { + "episode": 99840, + "epoch": 0.4276901987662783, + "eps": 6, + "loss/policy_avg": 0.10841876268386841, + "lr": 7.265625e-07, + "objective/entropy": -42.906219482421875, + "objective/kl": 37.530914306640625, + "objective/non_score_reward": -0.18765458464622498, + "objective/rlhf_reward": 10.194138526916504, + "objective/scores": 10.381793022155762, + "policy/approxkl_avg": 0.21525612473487854, + "policy/clipfrac_avg": 0.3359375, + "policy/entropy_avg": 0.28797751665115356, + "step": 194, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1998, + "val/ratio": 0.9999988079071045, + "val/ratio_var": 2.4885796392482007e-06 + }, + { + "episode": 100352, + "epoch": 0.4298834818368746, + "eps": 6, + "loss/policy_avg": 0.10143588483333588, + "lr": 7.1484375e-07, + "objective/entropy": -40.330078125, + "objective/kl": 38.33295440673828, + "objective/non_score_reward": -0.19166478514671326, + "objective/rlhf_reward": 10.228219985961914, + "objective/scores": 10.419885635375977, + "policy/approxkl_avg": 0.20035621523857117, + "policy/clipfrac_avg": 0.33984375, + "policy/entropy_avg": 0.2809005379676819, + "step": 195, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2004, + "val/ratio": 1.0001542568206787, + "val/ratio_var": 2.452593435009476e-06 + }, + { + "episode": 100864, + "epoch": 0.4320767649074709, + "eps": 6, + "loss/policy_avg": 0.12953543663024902, + "lr": 7.03125e-07, + "objective/entropy": -41.211490631103516, + "objective/kl": 39.28241729736328, + "objective/non_score_reward": -0.1964120864868164, + "objective/rlhf_reward": 10.131875991821289, + "objective/scores": 10.328287124633789, + "policy/approxkl_avg": 0.2059415578842163, + "policy/clipfrac_avg": 0.33203125, + "policy/entropy_avg": 0.2694932818412781, + "step": 196, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1956, + "val/ratio": 1.0002844333648682, + "val/ratio_var": 2.822038823069306e-06 + }, + { + "episode": 101376, + "epoch": 0.4342700479780672, + "eps": 6, + "loss/policy_avg": 0.09656757861375809, + "lr": 6.9140625e-07, + "objective/entropy": -39.60321044921875, + "objective/kl": 40.691993713378906, + "objective/non_score_reward": -0.2034599781036377, + "objective/rlhf_reward": 10.239164352416992, + "objective/scores": 10.442625045776367, + "policy/approxkl_avg": 0.18502315878868103, + "policy/clipfrac_avg": 0.31640625, + "policy/entropy_avg": 0.2789159119129181, + "step": 197, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1987, + "val/ratio": 1.000156044960022, + "val/ratio_var": 2.2366070879797917e-06 + }, + { + "episode": 101888, + "epoch": 0.43646333104866347, + "eps": 6, + "loss/policy_avg": 0.0859491229057312, + "lr": 6.796875e-07, + "objective/entropy": -38.96410369873047, + "objective/kl": 41.062564849853516, + "objective/non_score_reward": -0.2053128331899643, + "objective/rlhf_reward": 10.23694133758545, + "objective/scores": 10.442254066467285, + "policy/approxkl_avg": 0.24058808386325836, + "policy/clipfrac_avg": 0.353515625, + "policy/entropy_avg": 0.2734566330909729, + "step": 198, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1992, + "val/ratio": 1.0001670122146606, + "val/ratio_var": 3.191878931829706e-06 + }, + { + "episode": 102400, + "epoch": 0.43865661411925977, + "eps": 6, + "loss/policy_avg": 0.11006498336791992, + "lr": 6.6796875e-07, + "objective/entropy": -38.98827362060547, + "objective/kl": 39.47478103637695, + "objective/non_score_reward": -0.1973738968372345, + "objective/rlhf_reward": 10.272533416748047, + "objective/scores": 10.469907760620117, + "policy/approxkl_avg": 0.20590484142303467, + "policy/clipfrac_avg": 0.326171875, + "policy/entropy_avg": 0.27251720428466797, + "step": 199, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1942, + "val/ratio": 0.9998576641082764, + "val/ratio_var": 2.986185336339986e-06 + }, + { + "episode": 102912, + "epoch": 0.44084989718985607, + "eps": 6, + "loss/policy_avg": 0.12146441638469696, + "lr": 6.562500000000001e-07, + "objective/entropy": -40.963356018066406, + "objective/kl": 38.48386001586914, + "objective/non_score_reward": -0.19241929054260254, + "objective/rlhf_reward": 10.313898086547852, + "objective/scores": 10.506318092346191, + "policy/approxkl_avg": 0.1644008457660675, + "policy/clipfrac_avg": 0.314453125, + "policy/entropy_avg": 0.2655995488166809, + "step": 200, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1928, + "val/ratio": 0.9998116493225098, + "val/ratio_var": 2.1452240162034286e-06 + }, + { + "episode": 103424, + "epoch": 0.44304318026045236, + "eps": 6, + "loss/policy_avg": 0.1258418709039688, + "lr": 6.4453125e-07, + "objective/entropy": -40.115211486816406, + "objective/kl": 40.336639404296875, + "objective/non_score_reward": -0.2016831934452057, + "objective/rlhf_reward": 10.239288330078125, + "objective/scores": 10.440971374511719, + "policy/approxkl_avg": 0.20189166069030762, + "policy/clipfrac_avg": 0.326171875, + "policy/entropy_avg": 0.27117398381233215, + "step": 201, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1955, + "val/ratio": 1.0002821683883667, + "val/ratio_var": 2.51285405283852e-06 + }, + { + "episode": 103936, + "epoch": 0.44523646333104866, + "eps": 6, + "loss/policy_avg": 0.17538924515247345, + "lr": 6.328125e-07, + "objective/entropy": -38.41377639770508, + "objective/kl": 40.93872833251953, + "objective/non_score_reward": -0.20469364523887634, + "objective/rlhf_reward": 10.182136535644531, + "objective/scores": 10.386829376220703, + "policy/approxkl_avg": 0.17323625087738037, + "policy/clipfrac_avg": 0.318359375, + "policy/entropy_avg": 0.27593186497688293, + "step": 202, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1908, + "val/ratio": 0.9999833703041077, + "val/ratio_var": 1.960131157829892e-06 + }, + { + "episode": 104448, + "epoch": 0.44742974640164496, + "eps": 6, + "loss/policy_avg": 0.14555960893630981, + "lr": 6.210937500000001e-07, + "objective/entropy": -39.35943603515625, + "objective/kl": 40.14939498901367, + "objective/non_score_reward": -0.20074696838855743, + "objective/rlhf_reward": 10.305095672607422, + "objective/scores": 10.505842208862305, + "policy/approxkl_avg": 0.20699313282966614, + "policy/clipfrac_avg": 0.3125, + "policy/entropy_avg": 0.26498085260391235, + "step": 203, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1865, + "val/ratio": 1.0000905990600586, + "val/ratio_var": 2.7077380764239933e-06 + }, + { + "episode": 104960, + "epoch": 0.44962302947224125, + "eps": 6, + "loss/policy_avg": 0.16808803379535675, + "lr": 6.09375e-07, + "objective/entropy": -38.82417297363281, + "objective/kl": 40.72381591796875, + "objective/non_score_reward": -0.20361904799938202, + "objective/rlhf_reward": 10.05816650390625, + "objective/scores": 10.261785507202148, + "policy/approxkl_avg": 0.20535245537757874, + "policy/clipfrac_avg": 0.296875, + "policy/entropy_avg": 0.26682859659194946, + "step": 204, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1901, + "val/ratio": 1.000447392463684, + "val/ratio_var": 4.010019893030403e-06 + }, + { + "episode": 105472, + "epoch": 0.45181631254283755, + "eps": 6, + "loss/policy_avg": 0.13649414479732513, + "lr": 5.9765625e-07, + "objective/entropy": -38.72916030883789, + "objective/kl": 39.52435302734375, + "objective/non_score_reward": -0.19762176275253296, + "objective/rlhf_reward": 10.200495719909668, + "objective/scores": 10.398117065429688, + "policy/approxkl_avg": 0.19853393733501434, + "policy/clipfrac_avg": 0.333984375, + "policy/entropy_avg": 0.24932250380516052, + "step": 205, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1857, + "val/ratio": 1.0000267028808594, + "val/ratio_var": 2.8564525109686656e-06 + }, + { + "episode": 105984, + "epoch": 0.45400959561343385, + "eps": 6, + "loss/policy_avg": 0.1474711298942566, + "lr": 5.859375000000001e-07, + "objective/entropy": -39.11212921142578, + "objective/kl": 40.107582092285156, + "objective/non_score_reward": -0.20053791999816895, + "objective/rlhf_reward": 10.051382064819336, + "objective/scores": 10.25191879272461, + "policy/approxkl_avg": 0.18098586797714233, + "policy/clipfrac_avg": 0.265625, + "policy/entropy_avg": 0.2650277018547058, + "step": 206, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1945, + "val/ratio": 1.0001517534255981, + "val/ratio_var": 2.2215056105778785e-06 + }, + { + "episode": 106496, + "epoch": 0.45620287868403014, + "eps": 6, + "loss/policy_avg": 0.21073520183563232, + "lr": 5.7421875e-07, + "objective/entropy": -38.74761962890625, + "objective/kl": 40.78113555908203, + "objective/non_score_reward": -0.20390570163726807, + "objective/rlhf_reward": 10.191478729248047, + "objective/scores": 10.395383834838867, + "policy/approxkl_avg": 0.21191664040088654, + "policy/clipfrac_avg": 0.328125, + "policy/entropy_avg": 0.2553490400314331, + "step": 207, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1871, + "val/ratio": 1.0000207424163818, + "val/ratio_var": 2.455075900797965e-06 + }, + { + "episode": 107008, + "epoch": 0.45839616175462644, + "eps": 6, + "loss/policy_avg": 0.12934532761573792, + "lr": 5.625e-07, + "objective/entropy": -39.1215705871582, + "objective/kl": 40.07392501831055, + "objective/non_score_reward": -0.20036959648132324, + "objective/rlhf_reward": 10.169363021850586, + "objective/scores": 10.369732856750488, + "policy/approxkl_avg": 0.17855875194072723, + "policy/clipfrac_avg": 0.302734375, + "policy/entropy_avg": 0.26004987955093384, + "step": 208, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1911, + "val/ratio": 1.000025987625122, + "val/ratio_var": 1.8282569271832472e-06 + }, + { + "episode": 107520, + "epoch": 0.46058944482522274, + "eps": 6, + "loss/policy_avg": 0.31929153203964233, + "lr": 5.5078125e-07, + "objective/entropy": -38.290008544921875, + "objective/kl": 39.87298583984375, + "objective/non_score_reward": -0.19936493039131165, + "objective/rlhf_reward": 10.168830871582031, + "objective/scores": 10.368196487426758, + "policy/approxkl_avg": 0.19574999809265137, + "policy/clipfrac_avg": 0.275390625, + "policy/entropy_avg": 0.24393311142921448, + "step": 209, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1875, + "val/ratio": 0.9998703598976135, + "val/ratio_var": 2.899037554016104e-06 + }, + { + "episode": 108032, + "epoch": 0.46278272789581903, + "eps": 6, + "loss/policy_avg": 0.14277383685112, + "lr": 5.390625e-07, + "objective/entropy": -36.05535125732422, + "objective/kl": 41.62527084350586, + "objective/non_score_reward": -0.20812633633613586, + "objective/rlhf_reward": 10.23617172241211, + "objective/scores": 10.444297790527344, + "policy/approxkl_avg": 0.20949417352676392, + "policy/clipfrac_avg": 0.28515625, + "policy/entropy_avg": 0.2431659698486328, + "step": 210, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1849, + "val/ratio": 1.000133991241455, + "val/ratio_var": 2.837778311004513e-06 + }, + { + "episode": 108544, + "epoch": 0.46497601096641533, + "eps": 6, + "loss/policy_avg": 0.08451884984970093, + "lr": 5.2734375e-07, + "objective/entropy": -37.417503356933594, + "objective/kl": 40.52192687988281, + "objective/non_score_reward": -0.20260962843894958, + "objective/rlhf_reward": 10.291685104370117, + "objective/scores": 10.494295120239258, + "policy/approxkl_avg": 0.1809503734111786, + "policy/clipfrac_avg": 0.302734375, + "policy/entropy_avg": 0.24842746555805206, + "step": 211, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1889, + "val/ratio": 1.0000265836715698, + "val/ratio_var": 2.840351498889504e-06 + }, + { + "episode": 109056, + "epoch": 0.46716929403701163, + "eps": 6, + "loss/policy_avg": 0.1416936069726944, + "lr": 5.15625e-07, + "objective/entropy": -36.85235595703125, + "objective/kl": 40.77280044555664, + "objective/non_score_reward": -0.2038639932870865, + "objective/rlhf_reward": 10.16319465637207, + "objective/scores": 10.367058753967285, + "policy/approxkl_avg": 0.23493634164333344, + "policy/clipfrac_avg": 0.3515625, + "policy/entropy_avg": 0.25094282627105713, + "step": 212, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1922, + "val/ratio": 1.0000346899032593, + "val/ratio_var": 2.971406729557202e-06 + }, + { + "episode": 109568, + "epoch": 0.4693625771076079, + "eps": 6, + "loss/policy_avg": 0.06791745126247406, + "lr": 5.0390625e-07, + "objective/entropy": -36.092864990234375, + "objective/kl": 41.685611724853516, + "objective/non_score_reward": -0.2084280550479889, + "objective/rlhf_reward": 10.266704559326172, + "objective/scores": 10.475132942199707, + "policy/approxkl_avg": 0.16744546592235565, + "policy/clipfrac_avg": 0.298828125, + "policy/entropy_avg": 0.2529471814632416, + "step": 213, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1911, + "val/ratio": 0.9999880790710449, + "val/ratio_var": 2.514202151360223e-06 + } + ], + "logging_steps": 1, + "max_steps": 128, + "num_input_tokens_seen": 0, + "num_train_epochs": 1.122960932145305, + "save_steps": 26, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": true, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0, + "train_batch_size": null, + "trial_name": null, + "trial_params": null +}