diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,23868 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 100, + "global_step": 15284, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 3.270111183780249e-09, + "logits/chosen": -2.634561777114868, + "logits/rejected": -2.673060417175293, + "logps/chosen": -207.5323944091797, + "logps/rejected": -286.9266052246094, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 3.270111183780249e-08, + "logits/chosen": -2.217697858810425, + "logits/rejected": -1.9652551412582397, + "logps/chosen": -186.06736755371094, + "logps/rejected": -165.34738159179688, + "loss": 0.6932, + "rewards/accuracies": 0.1666666716337204, + "rewards/chosen": -0.0014695884892717004, + "rewards/margins": -0.002330251270905137, + "rewards/rejected": 0.0008606627234257758, + "step": 10 + }, + { + "epoch": 0.0, + "learning_rate": 6.540222367560497e-08, + "logits/chosen": -2.4319119453430176, + "logits/rejected": -2.2228429317474365, + "logps/chosen": -232.4527587890625, + "logps/rejected": -231.435546875, + "loss": 0.6931, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": -4.6026078052818775e-05, + "rewards/margins": 7.92687205830589e-05, + "rewards/rejected": -0.0001252948131877929, + "step": 20 + }, + { + "epoch": 0.0, + "learning_rate": 9.810333551340746e-08, + "logits/chosen": -2.25822377204895, + "logits/rejected": -2.162461996078491, + "logps/chosen": -197.378173828125, + "logps/rejected": -219.074951171875, + "loss": 0.6931, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.00045980390859767795, + "rewards/margins": 0.00029530542087741196, + "rewards/rejected": 0.0001644986041355878, + "step": 30 + }, + { + "epoch": 0.0, + "learning_rate": 1.3080444735120995e-07, + "logits/chosen": -2.211453914642334, + "logits/rejected": -2.251152992248535, + "logps/chosen": -276.0306701660156, + "logps/rejected": -265.74371337890625, + "loss": 0.6931, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.0004297545528970659, + "rewards/margins": -0.0005864914273843169, + "rewards/rejected": 0.00015673683083150536, + "step": 40 + }, + { + "epoch": 0.0, + "learning_rate": 1.6350555918901243e-07, + "logits/chosen": -2.3490729331970215, + "logits/rejected": -2.1418588161468506, + "logps/chosen": -204.78414916992188, + "logps/rejected": -184.72738647460938, + "loss": 0.6931, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.0002141618897439912, + "rewards/margins": 0.0005374640459194779, + "rewards/rejected": -0.0003233022871427238, + "step": 50 + }, + { + "epoch": 0.0, + "learning_rate": 1.9620667102681492e-07, + "logits/chosen": -2.30709171295166, + "logits/rejected": -2.0678863525390625, + "logps/chosen": -209.7256622314453, + "logps/rejected": -185.79867553710938, + "loss": 0.6932, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.0008024474373087287, + "rewards/margins": -0.0010456187883391976, + "rewards/rejected": 0.00184806645847857, + "step": 60 + }, + { + "epoch": 0.0, + "learning_rate": 2.289077828646174e-07, + "logits/chosen": -2.2695107460021973, + "logits/rejected": -2.1568443775177, + "logps/chosen": -218.00942993164062, + "logps/rejected": -207.9250946044922, + "loss": 0.6932, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.00100115523673594, + "rewards/margins": -0.00015484937466681004, + "rewards/rejected": 0.00115600461140275, + "step": 70 + }, + { + "epoch": 0.01, + "learning_rate": 2.616088947024199e-07, + "logits/chosen": -2.5082268714904785, + "logits/rejected": -2.227625608444214, + "logps/chosen": -258.78826904296875, + "logps/rejected": -213.654541015625, + "loss": 0.6931, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.0016212640330195427, + "rewards/margins": 0.00033446805900894105, + "rewards/rejected": 0.001286796061322093, + "step": 80 + }, + { + "epoch": 0.01, + "learning_rate": 2.943100065402224e-07, + "logits/chosen": -2.25887393951416, + "logits/rejected": -2.173290252685547, + "logps/chosen": -184.6951141357422, + "logps/rejected": -165.476806640625, + "loss": 0.6932, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": 0.0008967015892267227, + "rewards/margins": 0.00010184949496760964, + "rewards/rejected": 0.0007948519778437912, + "step": 90 + }, + { + "epoch": 0.01, + "learning_rate": 3.2701111837802487e-07, + "logits/chosen": -2.4311330318450928, + "logits/rejected": -2.425891399383545, + "logps/chosen": -168.7532196044922, + "logps/rejected": -183.79940795898438, + "loss": 0.6932, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 0.0018166687805205584, + "rewards/margins": -0.00107972149271518, + "rewards/rejected": 0.0028963901568204165, + "step": 100 + }, + { + "epoch": 0.01, + "eval_logits/chosen": -2.348848581314087, + "eval_logits/rejected": -2.1603763103485107, + "eval_logps/chosen": -231.76463317871094, + "eval_logps/rejected": -211.4439239501953, + "eval_loss": 0.6931134462356567, + "eval_rewards/accuracies": 0.4950000047683716, + "eval_rewards/chosen": 0.0024031461216509342, + "eval_rewards/margins": 0.0007233246578834951, + "eval_rewards/rejected": 0.0016798212891444564, + "eval_runtime": 707.3142, + "eval_samples_per_second": 2.828, + "eval_steps_per_second": 1.414, + "step": 100 + }, + { + "epoch": 0.01, + "learning_rate": 3.5971223021582736e-07, + "logits/chosen": -2.347839593887329, + "logits/rejected": -1.9999773502349854, + "logps/chosen": -222.35336303710938, + "logps/rejected": -166.99118041992188, + "loss": 0.6931, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.004421192221343517, + "rewards/margins": 0.0019870258402079344, + "rewards/rejected": 0.002434166381135583, + "step": 110 + }, + { + "epoch": 0.01, + "learning_rate": 3.9241334205362984e-07, + "logits/chosen": -2.340026378631592, + "logits/rejected": -2.244414806365967, + "logps/chosen": -223.99569702148438, + "logps/rejected": -234.1189422607422, + "loss": 0.6931, + "rewards/accuracies": 0.3499999940395355, + "rewards/chosen": 0.00296420999802649, + "rewards/margins": 0.0004412824346218258, + "rewards/rejected": 0.002522927476093173, + "step": 120 + }, + { + "epoch": 0.01, + "learning_rate": 4.251144538914324e-07, + "logits/chosen": -2.2618038654327393, + "logits/rejected": -2.217468738555908, + "logps/chosen": -149.3894500732422, + "logps/rejected": -148.2598114013672, + "loss": 0.6931, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0025640667881816626, + "rewards/margins": 0.00097900559194386, + "rewards/rejected": 0.0015850610798224807, + "step": 130 + }, + { + "epoch": 0.01, + "learning_rate": 4.578155657292348e-07, + "logits/chosen": -2.3222999572753906, + "logits/rejected": -2.2233099937438965, + "logps/chosen": -225.5967254638672, + "logps/rejected": -159.4678955078125, + "loss": 0.6931, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.0036222212947905064, + "rewards/margins": 0.0013900110498070717, + "rewards/rejected": 0.0022322097793221474, + "step": 140 + }, + { + "epoch": 0.01, + "learning_rate": 4.905166775670374e-07, + "logits/chosen": -2.367203950881958, + "logits/rejected": -2.1586971282958984, + "logps/chosen": -230.97109985351562, + "logps/rejected": -229.11178588867188, + "loss": 0.6929, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.003845545928925276, + "rewards/margins": 0.003386072115972638, + "rewards/rejected": 0.00045947395847178996, + "step": 150 + }, + { + "epoch": 0.01, + "learning_rate": 5.232177894048398e-07, + "logits/chosen": -2.2155299186706543, + "logits/rejected": -2.2280077934265137, + "logps/chosen": -260.2898864746094, + "logps/rejected": -224.85397338867188, + "loss": 0.693, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.005785978864878416, + "rewards/margins": 0.004330903757363558, + "rewards/rejected": 0.0014550751075148582, + "step": 160 + }, + { + "epoch": 0.01, + "learning_rate": 5.559189012426422e-07, + "logits/chosen": -2.3153624534606934, + "logits/rejected": -2.0374207496643066, + "logps/chosen": -180.40968322753906, + "logps/rejected": -156.81607055664062, + "loss": 0.6928, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.004428991116583347, + "rewards/margins": 0.0036846441216766834, + "rewards/rejected": 0.0007443467038683593, + "step": 170 + }, + { + "epoch": 0.01, + "learning_rate": 5.886200130804448e-07, + "logits/chosen": -2.3993406295776367, + "logits/rejected": -2.3392200469970703, + "logps/chosen": -217.6866455078125, + "logps/rejected": -198.7965850830078, + "loss": 0.6929, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.005419188644737005, + "rewards/margins": 0.005340488161891699, + "rewards/rejected": 7.869987894082442e-05, + "step": 180 + }, + { + "epoch": 0.01, + "learning_rate": 6.213211249182473e-07, + "logits/chosen": -2.0705583095550537, + "logits/rejected": -2.175136089324951, + "logps/chosen": -191.1099853515625, + "logps/rejected": -208.73690795898438, + "loss": 0.693, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.003862470854073763, + "rewards/margins": 0.004704989958554506, + "rewards/rejected": -0.0008425191044807434, + "step": 190 + }, + { + "epoch": 0.01, + "learning_rate": 6.540222367560497e-07, + "logits/chosen": -2.27732253074646, + "logits/rejected": -2.24127197265625, + "logps/chosen": -146.89163208007812, + "logps/rejected": -177.7828826904297, + "loss": 0.6927, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.0032042518723756075, + "rewards/margins": 0.0057475874200463295, + "rewards/rejected": -0.0025433353148400784, + "step": 200 + }, + { + "epoch": 0.01, + "eval_logits/chosen": -2.3492345809936523, + "eval_logits/rejected": -2.1607820987701416, + "eval_logps/chosen": -231.4797821044922, + "eval_logps/rejected": -211.65264892578125, + "eval_loss": 0.6928496360778809, + "eval_rewards/accuracies": 0.5835000276565552, + "eval_rewards/chosen": 0.005251556169241667, + "eval_rewards/margins": 0.0056591029278934, + "eval_rewards/rejected": -0.0004075466131325811, + "eval_runtime": 708.7402, + "eval_samples_per_second": 2.822, + "eval_steps_per_second": 1.411, + "step": 200 + }, + { + "epoch": 0.01, + "learning_rate": 6.867233485938523e-07, + "logits/chosen": -2.4269886016845703, + "logits/rejected": -2.2028229236602783, + "logps/chosen": -218.64584350585938, + "logps/rejected": -188.28201293945312, + "loss": 0.6928, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.008876695297658443, + "rewards/margins": 0.010037838481366634, + "rewards/rejected": -0.0011611439986154437, + "step": 210 + }, + { + "epoch": 0.01, + "learning_rate": 7.194244604316547e-07, + "logits/chosen": -2.2129268646240234, + "logits/rejected": -2.0407798290252686, + "logps/chosen": -182.85243225097656, + "logps/rejected": -174.8861083984375, + "loss": 0.693, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": 0.004839606583118439, + "rewards/margins": 0.004267896059900522, + "rewards/rejected": 0.000571710173971951, + "step": 220 + }, + { + "epoch": 0.02, + "learning_rate": 7.521255722694571e-07, + "logits/chosen": -2.421215534210205, + "logits/rejected": -2.025574207305908, + "logps/chosen": -278.98101806640625, + "logps/rejected": -184.08096313476562, + "loss": 0.6929, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.007983444258570671, + "rewards/margins": 0.005552899558097124, + "rewards/rejected": 0.0024305458646267653, + "step": 230 + }, + { + "epoch": 0.02, + "learning_rate": 7.848266841072597e-07, + "logits/chosen": -2.199948310852051, + "logits/rejected": -2.164917469024658, + "logps/chosen": -214.61245727539062, + "logps/rejected": -206.2373504638672, + "loss": 0.6923, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.01238412968814373, + "rewards/margins": 0.01192299835383892, + "rewards/rejected": 0.0004611331969499588, + "step": 240 + }, + { + "epoch": 0.02, + "learning_rate": 8.175277959450622e-07, + "logits/chosen": -2.1677653789520264, + "logits/rejected": -2.3267643451690674, + "logps/chosen": -218.1819610595703, + "logps/rejected": -220.6886444091797, + "loss": 0.6926, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.022441856563091278, + "rewards/margins": 0.009667301550507545, + "rewards/rejected": 0.012774554081261158, + "step": 250 + }, + { + "epoch": 0.02, + "learning_rate": 8.502289077828648e-07, + "logits/chosen": -2.5052707195281982, + "logits/rejected": -2.144476890563965, + "logps/chosen": -254.23361206054688, + "logps/rejected": -189.0504608154297, + "loss": 0.6926, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.02921391651034355, + "rewards/margins": 0.011754143051803112, + "rewards/rejected": 0.017459776252508163, + "step": 260 + }, + { + "epoch": 0.02, + "learning_rate": 8.829300196206672e-07, + "logits/chosen": -2.422799587249756, + "logits/rejected": -2.1278483867645264, + "logps/chosen": -246.3038330078125, + "logps/rejected": -230.50228881835938, + "loss": 0.6923, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.02507014013826847, + "rewards/margins": 0.014279340393841267, + "rewards/rejected": 0.01079079881310463, + "step": 270 + }, + { + "epoch": 0.02, + "learning_rate": 9.156311314584696e-07, + "logits/chosen": -2.3067777156829834, + "logits/rejected": -2.1929802894592285, + "logps/chosen": -159.95680236816406, + "logps/rejected": -146.39175415039062, + "loss": 0.6928, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 0.022316502407193184, + "rewards/margins": 0.009042134508490562, + "rewards/rejected": 0.013274368830025196, + "step": 280 + }, + { + "epoch": 0.02, + "learning_rate": 9.483322432962722e-07, + "logits/chosen": -2.555990695953369, + "logits/rejected": -2.1609978675842285, + "logps/chosen": -282.0716857910156, + "logps/rejected": -225.60147094726562, + "loss": 0.693, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.031071314588189125, + "rewards/margins": 0.010234272107481956, + "rewards/rejected": 0.02083704061806202, + "step": 290 + }, + { + "epoch": 0.02, + "learning_rate": 9.810333551340747e-07, + "logits/chosen": -2.3414573669433594, + "logits/rejected": -2.197089433670044, + "logps/chosen": -265.0999755859375, + "logps/rejected": -238.67355346679688, + "loss": 0.6917, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.035596780478954315, + "rewards/margins": 0.022229164838790894, + "rewards/rejected": 0.013367618434131145, + "step": 300 + }, + { + "epoch": 0.02, + "eval_logits/chosen": -2.3535282611846924, + "eval_logits/rejected": -2.164868116378784, + "eval_logps/chosen": -228.6509246826172, + "eval_logps/rejected": -209.84600830078125, + "eval_loss": 0.6924605965614319, + "eval_rewards/accuracies": 0.5830000042915344, + "eval_rewards/chosen": 0.03354022651910782, + "eval_rewards/margins": 0.015881428495049477, + "eval_rewards/rejected": 0.017658798024058342, + "eval_runtime": 705.9246, + "eval_samples_per_second": 2.833, + "eval_steps_per_second": 1.417, + "step": 300 + }, + { + "epoch": 0.02, + "learning_rate": 1.0137344669718771e-06, + "logits/chosen": -2.350961208343506, + "logits/rejected": -2.365408420562744, + "logps/chosen": -166.96469116210938, + "logps/rejected": -155.90208435058594, + "loss": 0.6929, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.025256266817450523, + "rewards/margins": 0.005879827309399843, + "rewards/rejected": 0.019376439973711967, + "step": 310 + }, + { + "epoch": 0.02, + "learning_rate": 1.0464355788096796e-06, + "logits/chosen": -2.455401659011841, + "logits/rejected": -2.0602850914001465, + "logps/chosen": -221.2332305908203, + "logps/rejected": -192.01666259765625, + "loss": 0.6924, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.03931882977485657, + "rewards/margins": 0.015343300998210907, + "rewards/rejected": 0.02397553250193596, + "step": 320 + }, + { + "epoch": 0.02, + "learning_rate": 1.079136690647482e-06, + "logits/chosen": -2.4358632564544678, + "logits/rejected": -2.2039878368377686, + "logps/chosen": -203.50843811035156, + "logps/rejected": -175.61509704589844, + "loss": 0.6925, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.04902844503521919, + "rewards/margins": 0.022538715973496437, + "rewards/rejected": 0.026489730924367905, + "step": 330 + }, + { + "epoch": 0.02, + "learning_rate": 1.1118378024852844e-06, + "logits/chosen": -2.187950372695923, + "logits/rejected": -2.353523015975952, + "logps/chosen": -150.39413452148438, + "logps/rejected": -177.75634765625, + "loss": 0.6933, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.033202774822711945, + "rewards/margins": -0.0008061464759521186, + "rewards/rejected": 0.034008920192718506, + "step": 340 + }, + { + "epoch": 0.02, + "learning_rate": 1.144538914323087e-06, + "logits/chosen": -2.426027774810791, + "logits/rejected": -1.987079381942749, + "logps/chosen": -317.85028076171875, + "logps/rejected": -247.85317993164062, + "loss": 0.692, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.042603787034749985, + "rewards/margins": 0.02849414013326168, + "rewards/rejected": 0.014109638519585133, + "step": 350 + }, + { + "epoch": 0.02, + "learning_rate": 1.1772400261608895e-06, + "logits/chosen": -2.4899606704711914, + "logits/rejected": -2.1962363719940186, + "logps/chosen": -220.11160278320312, + "logps/rejected": -192.32504272460938, + "loss": 0.6919, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.053684353828430176, + "rewards/margins": 0.023779017850756645, + "rewards/rejected": 0.02990533970296383, + "step": 360 + }, + { + "epoch": 0.02, + "learning_rate": 1.2099411379986922e-06, + "logits/chosen": -2.1702442169189453, + "logits/rejected": -2.2464897632598877, + "logps/chosen": -192.40721130371094, + "logps/rejected": -206.269287109375, + "loss": 0.6929, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 0.05089518427848816, + "rewards/margins": 0.010943805798888206, + "rewards/rejected": 0.0399513766169548, + "step": 370 + }, + { + "epoch": 0.02, + "learning_rate": 1.2426422498364946e-06, + "logits/chosen": -2.3307952880859375, + "logits/rejected": -2.042811393737793, + "logps/chosen": -216.83474731445312, + "logps/rejected": -161.6975860595703, + "loss": 0.6926, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.04813487082719803, + "rewards/margins": 0.024402152746915817, + "rewards/rejected": 0.02373271808028221, + "step": 380 + }, + { + "epoch": 0.03, + "learning_rate": 1.2753433616742968e-06, + "logits/chosen": -2.329103946685791, + "logits/rejected": -2.249396562576294, + "logps/chosen": -181.30165100097656, + "logps/rejected": -244.35458374023438, + "loss": 0.6919, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.039993561804294586, + "rewards/margins": 0.023708099499344826, + "rewards/rejected": 0.01628546044230461, + "step": 390 + }, + { + "epoch": 0.03, + "learning_rate": 1.3080444735120995e-06, + "logits/chosen": -2.490332841873169, + "logits/rejected": -2.120997428894043, + "logps/chosen": -220.3374481201172, + "logps/rejected": -178.84103393554688, + "loss": 0.6916, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.05155234411358833, + "rewards/margins": 0.0375380739569664, + "rewards/rejected": 0.014014266431331635, + "step": 400 + }, + { + "epoch": 0.03, + "eval_logits/chosen": -2.354814052581787, + "eval_logits/rejected": -2.1659581661224365, + "eval_logps/chosen": -227.3407745361328, + "eval_logps/rejected": -209.3865509033203, + "eval_loss": 0.6920492649078369, + "eval_rewards/accuracies": 0.6019999980926514, + "eval_rewards/chosen": 0.04664193466305733, + "eval_rewards/margins": 0.02438831515610218, + "eval_rewards/rejected": 0.022253619506955147, + "eval_runtime": 707.4391, + "eval_samples_per_second": 2.827, + "eval_steps_per_second": 1.414, + "step": 400 + }, + { + "epoch": 0.03, + "learning_rate": 1.3407455853499021e-06, + "logits/chosen": -2.4750816822052, + "logits/rejected": -2.324173927307129, + "logps/chosen": -257.1285095214844, + "logps/rejected": -223.27047729492188, + "loss": 0.692, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.05224750190973282, + "rewards/margins": 0.025383714586496353, + "rewards/rejected": 0.026863792911171913, + "step": 410 + }, + { + "epoch": 0.03, + "learning_rate": 1.3734466971877046e-06, + "logits/chosen": -2.3002800941467285, + "logits/rejected": -2.1997017860412598, + "logps/chosen": -177.15582275390625, + "logps/rejected": -172.67115783691406, + "loss": 0.6924, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.04763823747634888, + "rewards/margins": 0.029840771108865738, + "rewards/rejected": 0.01779746450483799, + "step": 420 + }, + { + "epoch": 0.03, + "learning_rate": 1.406147809025507e-06, + "logits/chosen": -2.2951817512512207, + "logits/rejected": -2.1071887016296387, + "logps/chosen": -209.26852416992188, + "logps/rejected": -180.22879028320312, + "loss": 0.691, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.05878716707229614, + "rewards/margins": 0.03459661826491356, + "rewards/rejected": 0.024190548807382584, + "step": 430 + }, + { + "epoch": 0.03, + "learning_rate": 1.4388489208633094e-06, + "logits/chosen": -2.3935062885284424, + "logits/rejected": -2.108060121536255, + "logps/chosen": -249.3080596923828, + "logps/rejected": -221.1660919189453, + "loss": 0.6926, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.05189204961061478, + "rewards/margins": 0.017697608098387718, + "rewards/rejected": 0.03419443964958191, + "step": 440 + }, + { + "epoch": 0.03, + "learning_rate": 1.471550032701112e-06, + "logits/chosen": -2.412630558013916, + "logits/rejected": -2.2085201740264893, + "logps/chosen": -184.7351531982422, + "logps/rejected": -194.98269653320312, + "loss": 0.6905, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.04912736266851425, + "rewards/margins": 0.04446180909872055, + "rewards/rejected": 0.004665557760745287, + "step": 450 + }, + { + "epoch": 0.03, + "learning_rate": 1.5042511445389143e-06, + "logits/chosen": -2.159377336502075, + "logits/rejected": -2.205676555633545, + "logps/chosen": -149.52809143066406, + "logps/rejected": -215.054931640625, + "loss": 0.6904, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.014119003899395466, + "rewards/margins": 0.02100800909101963, + "rewards/rejected": -0.006889003328979015, + "step": 460 + }, + { + "epoch": 0.03, + "learning_rate": 1.536952256376717e-06, + "logits/chosen": -2.049952983856201, + "logits/rejected": -2.111029624938965, + "logps/chosen": -199.77395629882812, + "logps/rejected": -250.3234405517578, + "loss": 0.6901, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.011566747911274433, + "rewards/margins": 0.0498543456196785, + "rewards/rejected": -0.03828759863972664, + "step": 470 + }, + { + "epoch": 0.03, + "learning_rate": 1.5696533682145194e-06, + "logits/chosen": -2.448256731033325, + "logits/rejected": -2.215850353240967, + "logps/chosen": -178.30746459960938, + "logps/rejected": -151.38975524902344, + "loss": 0.6909, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.05939297750592232, + "rewards/margins": 0.04141029715538025, + "rewards/rejected": 0.01798268035054207, + "step": 480 + }, + { + "epoch": 0.03, + "learning_rate": 1.602354480052322e-06, + "logits/chosen": -2.4165821075439453, + "logits/rejected": -2.3280694484710693, + "logps/chosen": -259.0218811035156, + "logps/rejected": -207.7999267578125, + "loss": 0.6911, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.07617911696434021, + "rewards/margins": 0.03729747608304024, + "rewards/rejected": 0.03888164460659027, + "step": 490 + }, + { + "epoch": 0.03, + "learning_rate": 1.6350555918901245e-06, + "logits/chosen": -2.21075701713562, + "logits/rejected": -1.8819067478179932, + "logps/chosen": -213.0443572998047, + "logps/rejected": -203.5986328125, + "loss": 0.6917, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.05024771764874458, + "rewards/margins": 0.0509779155254364, + "rewards/rejected": -0.0007301971199922264, + "step": 500 + }, + { + "epoch": 0.03, + "eval_logits/chosen": -2.349929094314575, + "eval_logits/rejected": -2.1615867614746094, + "eval_logps/chosen": -225.62718200683594, + "eval_logps/rejected": -209.42613220214844, + "eval_loss": 0.6916049718856812, + "eval_rewards/accuracies": 0.6060000061988831, + "eval_rewards/chosen": 0.06377778202295303, + "eval_rewards/margins": 0.04191993921995163, + "eval_rewards/rejected": 0.021857835352420807, + "eval_runtime": 704.9072, + "eval_samples_per_second": 2.837, + "eval_steps_per_second": 1.419, + "step": 500 + }, + { + "epoch": 0.03, + "learning_rate": 1.6677567037279269e-06, + "logits/chosen": -2.4886791706085205, + "logits/rejected": -2.138219118118286, + "logps/chosen": -287.70098876953125, + "logps/rejected": -252.02603149414062, + "loss": 0.6929, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.085626520216465, + "rewards/margins": 0.04710019752383232, + "rewards/rejected": 0.038526326417922974, + "step": 510 + }, + { + "epoch": 0.03, + "learning_rate": 1.7004578155657295e-06, + "logits/chosen": -2.2930407524108887, + "logits/rejected": -2.316594362258911, + "logps/chosen": -199.58074951171875, + "logps/rejected": -185.34860229492188, + "loss": 0.6915, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.0713859498500824, + "rewards/margins": 0.061885036528110504, + "rewards/rejected": 0.009500918909907341, + "step": 520 + }, + { + "epoch": 0.03, + "learning_rate": 1.7331589274035318e-06, + "logits/chosen": -2.1680140495300293, + "logits/rejected": -2.0505690574645996, + "logps/chosen": -180.614990234375, + "logps/rejected": -186.27236938476562, + "loss": 0.692, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.0331156924366951, + "rewards/margins": 0.06211583688855171, + "rewards/rejected": -0.029000144451856613, + "step": 530 + }, + { + "epoch": 0.04, + "learning_rate": 1.7658600392413344e-06, + "logits/chosen": -2.396062135696411, + "logits/rejected": -2.2367682456970215, + "logps/chosen": -199.58575439453125, + "logps/rejected": -187.2077178955078, + "loss": 0.6913, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.032696597278118134, + "rewards/margins": 0.049915581941604614, + "rewards/rejected": -0.017218980938196182, + "step": 540 + }, + { + "epoch": 0.04, + "learning_rate": 1.7985611510791368e-06, + "logits/chosen": -2.396876811981201, + "logits/rejected": -1.942486047744751, + "logps/chosen": -256.23150634765625, + "logps/rejected": -247.9710235595703, + "loss": 0.692, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.0731818825006485, + "rewards/margins": 0.08128007501363754, + "rewards/rejected": -0.008098193444311619, + "step": 550 + }, + { + "epoch": 0.04, + "learning_rate": 1.8312622629169393e-06, + "logits/chosen": -2.3080027103424072, + "logits/rejected": -2.1690850257873535, + "logps/chosen": -257.26641845703125, + "logps/rejected": -226.06912231445312, + "loss": 0.6917, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.06187974289059639, + "rewards/margins": 0.03323299437761307, + "rewards/rejected": 0.028646748512983322, + "step": 560 + }, + { + "epoch": 0.04, + "learning_rate": 1.8639633747547417e-06, + "logits/chosen": -2.3971519470214844, + "logits/rejected": -2.228024482727051, + "logps/chosen": -224.38430786132812, + "logps/rejected": -191.48605346679688, + "loss": 0.6914, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.04760807007551193, + "rewards/margins": 0.027385840192437172, + "rewards/rejected": 0.02022222802042961, + "step": 570 + }, + { + "epoch": 0.04, + "learning_rate": 1.8966644865925443e-06, + "logits/chosen": -2.246232748031616, + "logits/rejected": -2.271991729736328, + "logps/chosen": -234.4510498046875, + "logps/rejected": -250.18887329101562, + "loss": 0.6909, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.04757726192474365, + "rewards/margins": 0.043433815240859985, + "rewards/rejected": 0.0041434429585933685, + "step": 580 + }, + { + "epoch": 0.04, + "learning_rate": 1.9293655984303466e-06, + "logits/chosen": -2.657580852508545, + "logits/rejected": -2.2350101470947266, + "logps/chosen": -282.17510986328125, + "logps/rejected": -204.83383178710938, + "loss": 0.6897, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.007911397144198418, + "rewards/margins": 0.056634120643138885, + "rewards/rejected": -0.04872272536158562, + "step": 590 + }, + { + "epoch": 0.04, + "learning_rate": 1.9620667102681494e-06, + "logits/chosen": -2.393247127532959, + "logits/rejected": -2.4066872596740723, + "logps/chosen": -187.5821533203125, + "logps/rejected": -175.9160614013672, + "loss": 0.6919, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.016703059896826744, + "rewards/margins": 0.026545118540525436, + "rewards/rejected": -0.009842058643698692, + "step": 600 + }, + { + "epoch": 0.04, + "eval_logits/chosen": -2.3568413257598877, + "eval_logits/rejected": -2.1674532890319824, + "eval_logps/chosen": -227.02455139160156, + "eval_logps/rejected": -211.3561248779297, + "eval_loss": 0.6913270354270935, + "eval_rewards/accuracies": 0.597000002861023, + "eval_rewards/chosen": 0.04980393126606941, + "eval_rewards/margins": 0.04724626615643501, + "eval_rewards/rejected": 0.00255767023190856, + "eval_runtime": 708.4382, + "eval_samples_per_second": 2.823, + "eval_steps_per_second": 1.412, + "step": 600 + }, + { + "epoch": 0.04, + "learning_rate": 1.994767822105952e-06, + "logits/chosen": -2.306018829345703, + "logits/rejected": -2.208861827850342, + "logps/chosen": -171.64891052246094, + "logps/rejected": -183.595947265625, + "loss": 0.6893, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 0.03554733842611313, + "rewards/margins": 0.055120062083005905, + "rewards/rejected": -0.019572719931602478, + "step": 610 + }, + { + "epoch": 0.04, + "learning_rate": 2.0274689339437543e-06, + "logits/chosen": -2.233059883117676, + "logits/rejected": -2.0187458992004395, + "logps/chosen": -270.9899597167969, + "logps/rejected": -238.4331817626953, + "loss": 0.6921, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.07699786126613617, + "rewards/margins": 0.027974560856819153, + "rewards/rejected": 0.04902329668402672, + "step": 620 + }, + { + "epoch": 0.04, + "learning_rate": 2.0601700457815567e-06, + "logits/chosen": -2.382429361343384, + "logits/rejected": -2.015047788619995, + "logps/chosen": -253.5185089111328, + "logps/rejected": -211.54025268554688, + "loss": 0.6927, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.05836200714111328, + "rewards/margins": 0.015349363908171654, + "rewards/rejected": 0.04301264509558678, + "step": 630 + }, + { + "epoch": 0.04, + "learning_rate": 2.092871157619359e-06, + "logits/chosen": -2.4374704360961914, + "logits/rejected": -2.250887393951416, + "logps/chosen": -169.18408203125, + "logps/rejected": -187.0492706298828, + "loss": 0.6917, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.07478086650371552, + "rewards/margins": 0.041536975651979446, + "rewards/rejected": 0.03324388712644577, + "step": 640 + }, + { + "epoch": 0.04, + "learning_rate": 2.1255722694571616e-06, + "logits/chosen": -2.44903302192688, + "logits/rejected": -2.05329966545105, + "logps/chosen": -271.42828369140625, + "logps/rejected": -182.59605407714844, + "loss": 0.6922, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.07466375082731247, + "rewards/margins": 0.041353899985551834, + "rewards/rejected": 0.033309854567050934, + "step": 650 + }, + { + "epoch": 0.04, + "learning_rate": 2.158273381294964e-06, + "logits/chosen": -2.3471286296844482, + "logits/rejected": -2.2432503700256348, + "logps/chosen": -207.7042236328125, + "logps/rejected": -214.0726776123047, + "loss": 0.6938, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.04722776263952255, + "rewards/margins": 0.008569743484258652, + "rewards/rejected": 0.0386580154299736, + "step": 660 + }, + { + "epoch": 0.04, + "learning_rate": 2.190974493132767e-06, + "logits/chosen": -2.3504185676574707, + "logits/rejected": -2.0488924980163574, + "logps/chosen": -233.1539306640625, + "logps/rejected": -181.9641571044922, + "loss": 0.6908, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.04525148868560791, + "rewards/margins": 0.05067021772265434, + "rewards/rejected": -0.005418726243078709, + "step": 670 + }, + { + "epoch": 0.04, + "learning_rate": 2.223675604970569e-06, + "logits/chosen": -2.4212958812713623, + "logits/rejected": -2.2168402671813965, + "logps/chosen": -204.07693481445312, + "logps/rejected": -190.05075073242188, + "loss": 0.692, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.06657350063323975, + "rewards/margins": 0.033041100949048996, + "rewards/rejected": 0.03353239223361015, + "step": 680 + }, + { + "epoch": 0.05, + "learning_rate": 2.2563767168083718e-06, + "logits/chosen": -2.495441436767578, + "logits/rejected": -2.002847194671631, + "logps/chosen": -250.1965789794922, + "logps/rejected": -191.36761474609375, + "loss": 0.6911, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.07055240124464035, + "rewards/margins": 0.04203369468450546, + "rewards/rejected": 0.028518706560134888, + "step": 690 + }, + { + "epoch": 0.05, + "learning_rate": 2.289077828646174e-06, + "logits/chosen": -2.32529354095459, + "logits/rejected": -2.090954542160034, + "logps/chosen": -234.6331329345703, + "logps/rejected": -214.04269409179688, + "loss": 0.6909, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.04427819699048996, + "rewards/margins": 0.04115144535899162, + "rewards/rejected": 0.0031267497688531876, + "step": 700 + }, + { + "epoch": 0.05, + "eval_logits/chosen": -2.350055456161499, + "eval_logits/rejected": -2.161485433578491, + "eval_logps/chosen": -226.39279174804688, + "eval_logps/rejected": -210.55441284179688, + "eval_loss": 0.6913213133811951, + "eval_rewards/accuracies": 0.6144999861717224, + "eval_rewards/chosen": 0.056121550500392914, + "eval_rewards/margins": 0.04554666578769684, + "eval_rewards/rejected": 0.010574882850050926, + "eval_runtime": 706.9392, + "eval_samples_per_second": 2.829, + "eval_steps_per_second": 1.415, + "step": 700 + }, + { + "epoch": 0.05, + "learning_rate": 2.3217789404839766e-06, + "logits/chosen": -2.2059988975524902, + "logits/rejected": -2.29160737991333, + "logps/chosen": -154.5279541015625, + "logps/rejected": -204.50648498535156, + "loss": 0.691, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.023031946271657944, + "rewards/margins": 0.03035845421254635, + "rewards/rejected": -0.007326505146920681, + "step": 710 + }, + { + "epoch": 0.05, + "learning_rate": 2.354480052321779e-06, + "logits/chosen": -2.52923321723938, + "logits/rejected": -2.1005005836486816, + "logps/chosen": -253.52212524414062, + "logps/rejected": -201.50367736816406, + "loss": 0.6883, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.048673033714294434, + "rewards/margins": 0.0762287974357605, + "rewards/rejected": -0.027555758133530617, + "step": 720 + }, + { + "epoch": 0.05, + "learning_rate": 2.3871811641595815e-06, + "logits/chosen": -2.3575785160064697, + "logits/rejected": -2.1780619621276855, + "logps/chosen": -252.4840087890625, + "logps/rejected": -198.02867126464844, + "loss": 0.6921, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.056915633380413055, + "rewards/margins": 0.042005524039268494, + "rewards/rejected": -0.09892116487026215, + "step": 730 + }, + { + "epoch": 0.05, + "learning_rate": 2.4198822759973843e-06, + "logits/chosen": -2.1879830360412598, + "logits/rejected": -2.2039308547973633, + "logps/chosen": -209.80526733398438, + "logps/rejected": -226.54446411132812, + "loss": 0.6909, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.19199207425117493, + "rewards/margins": 0.028066366910934448, + "rewards/rejected": -0.220058411359787, + "step": 740 + }, + { + "epoch": 0.05, + "learning_rate": 2.4525833878351864e-06, + "logits/chosen": -2.4402458667755127, + "logits/rejected": -2.2395517826080322, + "logps/chosen": -271.01220703125, + "logps/rejected": -208.87350463867188, + "loss": 0.6917, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.18211308121681213, + "rewards/margins": 0.04141292721033096, + "rewards/rejected": -0.2235260307788849, + "step": 750 + }, + { + "epoch": 0.05, + "learning_rate": 2.4852844996729892e-06, + "logits/chosen": -2.1985716819763184, + "logits/rejected": -2.10333514213562, + "logps/chosen": -263.1232604980469, + "logps/rejected": -269.29693603515625, + "loss": 0.6907, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.23998942971229553, + "rewards/margins": 0.04715558513998985, + "rewards/rejected": -0.2871449887752533, + "step": 760 + }, + { + "epoch": 0.05, + "learning_rate": 2.5179856115107916e-06, + "logits/chosen": -2.3487696647644043, + "logits/rejected": -2.030247211456299, + "logps/chosen": -293.29290771484375, + "logps/rejected": -244.9794158935547, + "loss": 0.6894, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1979665905237198, + "rewards/margins": 0.056348543614149094, + "rewards/rejected": -0.2543151080608368, + "step": 770 + }, + { + "epoch": 0.05, + "learning_rate": 2.5506867233485937e-06, + "logits/chosen": -2.4455208778381348, + "logits/rejected": -2.0224125385284424, + "logps/chosen": -275.57421875, + "logps/rejected": -228.21737670898438, + "loss": 0.6901, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09878290444612503, + "rewards/margins": 0.10141804069280624, + "rewards/rejected": -0.20020096004009247, + "step": 780 + }, + { + "epoch": 0.05, + "learning_rate": 2.5833878351863965e-06, + "logits/chosen": -2.4715168476104736, + "logits/rejected": -2.403022050857544, + "logps/chosen": -257.7041931152344, + "logps/rejected": -262.29986572265625, + "loss": 0.6916, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.05706269294023514, + "rewards/margins": 0.07216020673513412, + "rewards/rejected": -0.12922289967536926, + "step": 790 + }, + { + "epoch": 0.05, + "learning_rate": 2.616088947024199e-06, + "logits/chosen": -2.155651092529297, + "logits/rejected": -2.314833164215088, + "logps/chosen": -206.03012084960938, + "logps/rejected": -235.5647430419922, + "loss": 0.6913, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.09540672600269318, + "rewards/margins": 0.050082337111234665, + "rewards/rejected": -0.14548906683921814, + "step": 800 + }, + { + "epoch": 0.05, + "eval_logits/chosen": -2.330695390701294, + "eval_logits/rejected": -2.142754316329956, + "eval_logps/chosen": -242.4707794189453, + "eval_logps/rejected": -227.2015838623047, + "eval_loss": 0.6913210153579712, + "eval_rewards/accuracies": 0.597000002861023, + "eval_rewards/chosen": -0.1046583503484726, + "eval_rewards/margins": 0.051238518208265305, + "eval_rewards/rejected": -0.155896857380867, + "eval_runtime": 705.7692, + "eval_samples_per_second": 2.834, + "eval_steps_per_second": 1.417, + "step": 800 + }, + { + "epoch": 0.05, + "learning_rate": 2.6487900588620014e-06, + "logits/chosen": -2.1080448627471924, + "logits/rejected": -1.8789002895355225, + "logps/chosen": -209.4484405517578, + "logps/rejected": -169.5839080810547, + "loss": 0.6932, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.09332854300737381, + "rewards/margins": 0.02387315407395363, + "rewards/rejected": -0.11720170080661774, + "step": 810 + }, + { + "epoch": 0.05, + "learning_rate": 2.6814911706998042e-06, + "logits/chosen": -2.293308734893799, + "logits/rejected": -2.1208784580230713, + "logps/chosen": -232.0146484375, + "logps/rejected": -204.10064697265625, + "loss": 0.6928, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.06675419211387634, + "rewards/margins": 0.028399985283613205, + "rewards/rejected": -0.09515418112277985, + "step": 820 + }, + { + "epoch": 0.05, + "learning_rate": 2.7141922825376067e-06, + "logits/chosen": -2.1897878646850586, + "logits/rejected": -2.195380210876465, + "logps/chosen": -264.03271484375, + "logps/rejected": -280.67877197265625, + "loss": 0.6917, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.06108593940734863, + "rewards/margins": 0.06715109199285507, + "rewards/rejected": -0.1282370388507843, + "step": 830 + }, + { + "epoch": 0.05, + "learning_rate": 2.746893394375409e-06, + "logits/chosen": -2.3309171199798584, + "logits/rejected": -2.2094645500183105, + "logps/chosen": -239.76480102539062, + "logps/rejected": -241.4962921142578, + "loss": 0.6918, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.053703807294368744, + "rewards/margins": 0.03512780740857124, + "rewards/rejected": -0.08883161842823029, + "step": 840 + }, + { + "epoch": 0.06, + "learning_rate": 2.779594506213211e-06, + "logits/chosen": -2.236567258834839, + "logits/rejected": -2.002687692642212, + "logps/chosen": -194.21266174316406, + "logps/rejected": -198.6126708984375, + "loss": 0.6918, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.04353756457567215, + "rewards/margins": 0.047133009880781174, + "rewards/rejected": -0.09067057073116302, + "step": 850 + }, + { + "epoch": 0.06, + "learning_rate": 2.812295618051014e-06, + "logits/chosen": -2.4146676063537598, + "logits/rejected": -2.2537002563476562, + "logps/chosen": -283.446533203125, + "logps/rejected": -226.4826202392578, + "loss": 0.6916, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.024853792041540146, + "rewards/margins": 0.04763239994645119, + "rewards/rejected": -0.07248619943857193, + "step": 860 + }, + { + "epoch": 0.06, + "learning_rate": 2.8449967298888164e-06, + "logits/chosen": -2.299940347671509, + "logits/rejected": -2.095834732055664, + "logps/chosen": -180.71926879882812, + "logps/rejected": -155.87872314453125, + "loss": 0.6921, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.03895248472690582, + "rewards/margins": 0.03217850998044014, + "rewards/rejected": -0.07113099843263626, + "step": 870 + }, + { + "epoch": 0.06, + "learning_rate": 2.877697841726619e-06, + "logits/chosen": -2.38145112991333, + "logits/rejected": -2.282527208328247, + "logps/chosen": -221.8948211669922, + "logps/rejected": -208.37954711914062, + "loss": 0.6917, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.03059009648859501, + "rewards/margins": 0.03299534320831299, + "rewards/rejected": -0.06358544528484344, + "step": 880 + }, + { + "epoch": 0.06, + "learning_rate": 2.9103989535644217e-06, + "logits/chosen": -2.3034727573394775, + "logits/rejected": -2.4145941734313965, + "logps/chosen": -219.39413452148438, + "logps/rejected": -252.17977905273438, + "loss": 0.6925, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.03910509869456291, + "rewards/margins": 0.0225521232932806, + "rewards/rejected": -0.06165723130106926, + "step": 890 + }, + { + "epoch": 0.06, + "learning_rate": 2.943100065402224e-06, + "logits/chosen": -2.3486392498016357, + "logits/rejected": -2.202688455581665, + "logps/chosen": -297.0926208496094, + "logps/rejected": -285.2032470703125, + "loss": 0.6921, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.04449423402547836, + "rewards/margins": 0.03427337482571602, + "rewards/rejected": -0.07876761257648468, + "step": 900 + }, + { + "epoch": 0.06, + "eval_logits/chosen": -2.3343303203582764, + "eval_logits/rejected": -2.146639347076416, + "eval_logps/chosen": -237.26773071289062, + "eval_logps/rejected": -221.73362731933594, + "eval_loss": 0.6909335255622864, + "eval_rewards/accuracies": 0.6060000061988831, + "eval_rewards/chosen": -0.05262775719165802, + "eval_rewards/margins": 0.04858950152993202, + "eval_rewards/rejected": -0.10121726244688034, + "eval_runtime": 708.5552, + "eval_samples_per_second": 2.823, + "eval_steps_per_second": 1.411, + "step": 900 + }, + { + "epoch": 0.06, + "learning_rate": 2.9758011772400266e-06, + "logits/chosen": -2.2568647861480713, + "logits/rejected": -2.2534162998199463, + "logps/chosen": -263.49017333984375, + "logps/rejected": -273.72900390625, + "loss": 0.6923, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.06180988624691963, + "rewards/margins": 0.028642665594816208, + "rewards/rejected": -0.09045255184173584, + "step": 910 + }, + { + "epoch": 0.06, + "learning_rate": 3.0085022890778286e-06, + "logits/chosen": -2.3166985511779785, + "logits/rejected": -2.0645124912261963, + "logps/chosen": -181.6991424560547, + "logps/rejected": -157.57730102539062, + "loss": 0.6918, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.05959589406847954, + "rewards/margins": 0.02801639400422573, + "rewards/rejected": -0.08761228621006012, + "step": 920 + }, + { + "epoch": 0.06, + "learning_rate": 3.0412034009156314e-06, + "logits/chosen": -2.258695363998413, + "logits/rejected": -2.4108707904815674, + "logps/chosen": -240.5486297607422, + "logps/rejected": -240.31802368164062, + "loss": 0.6922, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.04862401634454727, + "rewards/margins": 0.040310461074113846, + "rewards/rejected": -0.08893446624279022, + "step": 930 + }, + { + "epoch": 0.06, + "learning_rate": 3.073904512753434e-06, + "logits/chosen": -2.4486923217773438, + "logits/rejected": -2.0814051628112793, + "logps/chosen": -231.5814208984375, + "logps/rejected": -202.97706604003906, + "loss": 0.6904, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.01932818815112114, + "rewards/margins": 0.06323892623186111, + "rewards/rejected": -0.08256711810827255, + "step": 940 + }, + { + "epoch": 0.06, + "learning_rate": 3.1066056245912363e-06, + "logits/chosen": -2.308929920196533, + "logits/rejected": -2.383852958679199, + "logps/chosen": -233.48721313476562, + "logps/rejected": -207.38430786132812, + "loss": 0.6911, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.025032073259353638, + "rewards/margins": 0.05856107547879219, + "rewards/rejected": -0.08359314501285553, + "step": 950 + }, + { + "epoch": 0.06, + "learning_rate": 3.1393067364290387e-06, + "logits/chosen": -2.3422508239746094, + "logits/rejected": -2.1356518268585205, + "logps/chosen": -227.9677734375, + "logps/rejected": -198.36422729492188, + "loss": 0.6908, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.028410235419869423, + "rewards/margins": 0.08982036262750626, + "rewards/rejected": -0.11823059618473053, + "step": 960 + }, + { + "epoch": 0.06, + "learning_rate": 3.1720078482668416e-06, + "logits/chosen": -2.2739663124084473, + "logits/rejected": -2.042900562286377, + "logps/chosen": -214.7518310546875, + "logps/rejected": -176.07008361816406, + "loss": 0.6902, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.031811777502298355, + "rewards/margins": 0.08425115048885345, + "rewards/rejected": -0.11606292426586151, + "step": 970 + }, + { + "epoch": 0.06, + "learning_rate": 3.204708960104644e-06, + "logits/chosen": -2.3709425926208496, + "logits/rejected": -2.0238471031188965, + "logps/chosen": -218.3297119140625, + "logps/rejected": -194.80873107910156, + "loss": 0.6892, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.009269696660339832, + "rewards/margins": 0.09066729247570038, + "rewards/rejected": -0.09993697702884674, + "step": 980 + }, + { + "epoch": 0.06, + "learning_rate": 3.237410071942446e-06, + "logits/chosen": -2.4116098880767822, + "logits/rejected": -2.1458096504211426, + "logps/chosen": -208.81069946289062, + "logps/rejected": -177.1934051513672, + "loss": 0.6931, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.024496430531144142, + "rewards/margins": 0.01946501061320305, + "rewards/rejected": -0.04396144300699234, + "step": 990 + }, + { + "epoch": 0.07, + "learning_rate": 3.270111183780249e-06, + "logits/chosen": -2.3320465087890625, + "logits/rejected": -2.106236219406128, + "logps/chosen": -201.52783203125, + "logps/rejected": -188.52328491210938, + "loss": 0.6903, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.015042750164866447, + "rewards/margins": 0.04998449236154556, + "rewards/rejected": -0.06502724438905716, + "step": 1000 + }, + { + "epoch": 0.07, + "eval_logits/chosen": -2.3452556133270264, + "eval_logits/rejected": -2.157487154006958, + "eval_logps/chosen": -232.08251953125, + "eval_logps/rejected": -217.237060546875, + "eval_loss": 0.6907655000686646, + "eval_rewards/accuracies": 0.6184999942779541, + "eval_rewards/chosen": -0.0007757164421491325, + "eval_rewards/margins": 0.05547565594315529, + "eval_rewards/rejected": -0.05625137314200401, + "eval_runtime": 709.7867, + "eval_samples_per_second": 2.818, + "eval_steps_per_second": 1.409, + "step": 1000 + }, + { + "epoch": 0.07, + "learning_rate": 3.3028122956180513e-06, + "logits/chosen": -2.202904462814331, + "logits/rejected": -2.257141351699829, + "logps/chosen": -211.8687744140625, + "logps/rejected": -244.3396759033203, + "loss": 0.6906, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.0022913392167538404, + "rewards/margins": 0.05001994967460632, + "rewards/rejected": -0.047728605568408966, + "step": 1010 + }, + { + "epoch": 0.07, + "learning_rate": 3.3355134074558538e-06, + "logits/chosen": -2.197819948196411, + "logits/rejected": -2.1015243530273438, + "logps/chosen": -222.99203491210938, + "logps/rejected": -202.92996215820312, + "loss": 0.6929, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.022592799738049507, + "rewards/margins": 0.0552542582154274, + "rewards/rejected": -0.07784706354141235, + "step": 1020 + }, + { + "epoch": 0.07, + "learning_rate": 3.368214519293656e-06, + "logits/chosen": -2.1490020751953125, + "logits/rejected": -1.9399007558822632, + "logps/chosen": -200.19967651367188, + "logps/rejected": -184.1601104736328, + "loss": 0.6919, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.012293432839214802, + "rewards/margins": 0.04858936741948128, + "rewards/rejected": -0.060882795602083206, + "step": 1030 + }, + { + "epoch": 0.07, + "learning_rate": 3.400915631131459e-06, + "logits/chosen": -2.1254327297210693, + "logits/rejected": -2.1563661098480225, + "logps/chosen": -200.5404052734375, + "logps/rejected": -237.29293823242188, + "loss": 0.6895, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.025679444894194603, + "rewards/margins": 0.08101227134466171, + "rewards/rejected": -0.10669170320034027, + "step": 1040 + }, + { + "epoch": 0.07, + "learning_rate": 3.4336167429692615e-06, + "logits/chosen": -2.3749256134033203, + "logits/rejected": -2.1596624851226807, + "logps/chosen": -216.2527618408203, + "logps/rejected": -205.42910766601562, + "loss": 0.6904, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.017023511230945587, + "rewards/margins": 0.054301291704177856, + "rewards/rejected": -0.07132480293512344, + "step": 1050 + }, + { + "epoch": 0.07, + "learning_rate": 3.4663178548070635e-06, + "logits/chosen": -2.1830973625183105, + "logits/rejected": -2.239147424697876, + "logps/chosen": -214.9247589111328, + "logps/rejected": -203.87339782714844, + "loss": 0.6902, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.0002921056002378464, + "rewards/margins": 0.053903043270111084, + "rewards/rejected": -0.05361093208193779, + "step": 1060 + }, + { + "epoch": 0.07, + "learning_rate": 3.499018966644866e-06, + "logits/chosen": -2.3346736431121826, + "logits/rejected": -2.139986038208008, + "logps/chosen": -193.098388671875, + "logps/rejected": -192.88113403320312, + "loss": 0.6897, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.012733638286590576, + "rewards/margins": 0.06595107167959213, + "rewards/rejected": -0.0786847174167633, + "step": 1070 + }, + { + "epoch": 0.07, + "learning_rate": 3.531720078482669e-06, + "logits/chosen": -2.2687487602233887, + "logits/rejected": -1.93032968044281, + "logps/chosen": -225.96875, + "logps/rejected": -222.5755615234375, + "loss": 0.6908, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0009903141763061285, + "rewards/margins": 0.07838054746389389, + "rewards/rejected": -0.07937086373567581, + "step": 1080 + }, + { + "epoch": 0.07, + "learning_rate": 3.5644211903204712e-06, + "logits/chosen": -2.2722859382629395, + "logits/rejected": -2.3190114498138428, + "logps/chosen": -195.7224884033203, + "logps/rejected": -204.1152801513672, + "loss": 0.6904, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.031599871814250946, + "rewards/margins": 0.05579303950071335, + "rewards/rejected": -0.024193167686462402, + "step": 1090 + }, + { + "epoch": 0.07, + "learning_rate": 3.5971223021582737e-06, + "logits/chosen": -2.429405927658081, + "logits/rejected": -2.290315866470337, + "logps/chosen": -257.29278564453125, + "logps/rejected": -203.24649047851562, + "loss": 0.6922, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.014919871464371681, + "rewards/margins": 0.030690353363752365, + "rewards/rejected": -0.045610226690769196, + "step": 1100 + }, + { + "epoch": 0.07, + "eval_logits/chosen": -2.3150722980499268, + "eval_logits/rejected": -2.1294045448303223, + "eval_logps/chosen": -232.15647888183594, + "eval_logps/rejected": -219.40240478515625, + "eval_loss": 0.691135585308075, + "eval_rewards/accuracies": 0.6274999976158142, + "eval_rewards/chosen": -0.0015151738189160824, + "eval_rewards/margins": 0.07638993859291077, + "eval_rewards/rejected": -0.07790511101484299, + "eval_runtime": 706.9471, + "eval_samples_per_second": 2.829, + "eval_steps_per_second": 1.415, + "step": 1100 + }, + { + "epoch": 0.07, + "learning_rate": 3.6298234139960765e-06, + "logits/chosen": -2.3906145095825195, + "logits/rejected": -2.036782741546631, + "logps/chosen": -216.01156616210938, + "logps/rejected": -173.99920654296875, + "loss": 0.6892, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.017574917525053024, + "rewards/margins": 0.07658959180116653, + "rewards/rejected": -0.05901466682553291, + "step": 1110 + }, + { + "epoch": 0.07, + "learning_rate": 3.6625245258338785e-06, + "logits/chosen": -2.191586971282959, + "logits/rejected": -2.110297679901123, + "logps/chosen": -233.83700561523438, + "logps/rejected": -320.0827331542969, + "loss": 0.6854, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.0494137704372406, + "rewards/margins": 0.09785051643848419, + "rewards/rejected": -0.04843674600124359, + "step": 1120 + }, + { + "epoch": 0.07, + "learning_rate": 3.695225637671681e-06, + "logits/chosen": -2.442321300506592, + "logits/rejected": -2.241528272628784, + "logps/chosen": -205.39053344726562, + "logps/rejected": -178.85687255859375, + "loss": 0.6897, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.018860872834920883, + "rewards/margins": 0.06542352586984634, + "rewards/rejected": -0.04656265676021576, + "step": 1130 + }, + { + "epoch": 0.07, + "learning_rate": 3.7279267495094834e-06, + "logits/chosen": -2.358170986175537, + "logits/rejected": -2.220486879348755, + "logps/chosen": -159.06790161132812, + "logps/rejected": -177.91981506347656, + "loss": 0.6895, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.013590176589787006, + "rewards/margins": 0.08726686984300613, + "rewards/rejected": -0.0736766904592514, + "step": 1140 + }, + { + "epoch": 0.08, + "learning_rate": 3.7606278613472863e-06, + "logits/chosen": -2.4245522022247314, + "logits/rejected": -2.0788369178771973, + "logps/chosen": -277.3457946777344, + "logps/rejected": -205.87890625, + "loss": 0.6928, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.04925479739904404, + "rewards/margins": 0.061012279242277145, + "rewards/rejected": -0.01175748836249113, + "step": 1150 + }, + { + "epoch": 0.08, + "learning_rate": 3.7933289731850887e-06, + "logits/chosen": -2.227184772491455, + "logits/rejected": -2.006626605987549, + "logps/chosen": -220.76419067382812, + "logps/rejected": -200.02944946289062, + "loss": 0.6892, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.07796461135149002, + "rewards/margins": 0.07523587346076965, + "rewards/rejected": 0.002728732768446207, + "step": 1160 + }, + { + "epoch": 0.08, + "learning_rate": 3.826030085022891e-06, + "logits/chosen": -2.394782543182373, + "logits/rejected": -2.1593916416168213, + "logps/chosen": -244.3356475830078, + "logps/rejected": -212.73583984375, + "loss": 0.6916, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.0331416018307209, + "rewards/margins": 0.04552285000681877, + "rewards/rejected": -0.012381250038743019, + "step": 1170 + }, + { + "epoch": 0.08, + "learning_rate": 3.858731196860693e-06, + "logits/chosen": -2.4333834648132324, + "logits/rejected": -1.9844642877578735, + "logps/chosen": -249.99893188476562, + "logps/rejected": -210.4248504638672, + "loss": 0.6925, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.008344946429133415, + "rewards/margins": 0.026615191251039505, + "rewards/rejected": -0.01827024295926094, + "step": 1180 + }, + { + "epoch": 0.08, + "learning_rate": 3.891432308698496e-06, + "logits/chosen": -2.120863437652588, + "logits/rejected": -2.063352108001709, + "logps/chosen": -193.93240356445312, + "logps/rejected": -226.5436553955078, + "loss": 0.6876, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.01297982782125473, + "rewards/margins": 0.07365990430116653, + "rewards/rejected": -0.060680072754621506, + "step": 1190 + }, + { + "epoch": 0.08, + "learning_rate": 3.924133420536299e-06, + "logits/chosen": -2.2395856380462646, + "logits/rejected": -2.041903018951416, + "logps/chosen": -184.75523376464844, + "logps/rejected": -157.53878784179688, + "loss": 0.6906, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.026105085387825966, + "rewards/margins": 0.061862241476774216, + "rewards/rejected": -0.08796733617782593, + "step": 1200 + }, + { + "epoch": 0.08, + "eval_logits/chosen": -2.3272151947021484, + "eval_logits/rejected": -2.1397786140441895, + "eval_logps/chosen": -234.7645263671875, + "eval_logps/rejected": -221.4020538330078, + "eval_loss": 0.6907363533973694, + "eval_rewards/accuracies": 0.637499988079071, + "eval_rewards/chosen": -0.027595827355980873, + "eval_rewards/margins": 0.07030569016933441, + "eval_rewards/rejected": -0.09790151566267014, + "eval_runtime": 706.2261, + "eval_samples_per_second": 2.832, + "eval_steps_per_second": 1.416, + "step": 1200 + }, + { + "epoch": 0.08, + "learning_rate": 3.956834532374101e-06, + "logits/chosen": -2.4392762184143066, + "logits/rejected": -2.0458781719207764, + "logps/chosen": -206.30532836914062, + "logps/rejected": -173.46829223632812, + "loss": 0.6875, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.010426463559269905, + "rewards/margins": 0.11114281415939331, + "rewards/rejected": -0.12156929075717926, + "step": 1210 + }, + { + "epoch": 0.08, + "learning_rate": 3.989535644211904e-06, + "logits/chosen": -2.329332113265991, + "logits/rejected": -2.0607223510742188, + "logps/chosen": -216.85635375976562, + "logps/rejected": -202.00527954101562, + "loss": 0.6905, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.00270930677652359, + "rewards/margins": 0.07769973576068878, + "rewards/rejected": -0.07499042898416519, + "step": 1220 + }, + { + "epoch": 0.08, + "learning_rate": 4.022236756049706e-06, + "logits/chosen": -2.5103940963745117, + "logits/rejected": -2.1222851276397705, + "logps/chosen": -273.70758056640625, + "logps/rejected": -246.398681640625, + "loss": 0.691, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.03429547697305679, + "rewards/margins": 0.05166538804769516, + "rewards/rejected": -0.017369914799928665, + "step": 1230 + }, + { + "epoch": 0.08, + "learning_rate": 4.054937867887509e-06, + "logits/chosen": -2.3860554695129395, + "logits/rejected": -1.9378671646118164, + "logps/chosen": -219.5018310546875, + "logps/rejected": -214.9021759033203, + "loss": 0.6907, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.04764958843588829, + "rewards/margins": 0.06188775226473808, + "rewards/rejected": -0.014238161034882069, + "step": 1240 + }, + { + "epoch": 0.08, + "learning_rate": 4.087638979725311e-06, + "logits/chosen": -2.3056764602661133, + "logits/rejected": -2.371333599090576, + "logps/chosen": -224.9440155029297, + "logps/rejected": -230.5066375732422, + "loss": 0.6927, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.02895757555961609, + "rewards/margins": 0.04107601195573807, + "rewards/rejected": -0.01211843267083168, + "step": 1250 + }, + { + "epoch": 0.08, + "learning_rate": 4.1203400915631135e-06, + "logits/chosen": -2.221919059753418, + "logits/rejected": -2.146714687347412, + "logps/chosen": -237.0450439453125, + "logps/rejected": -214.48135375976562, + "loss": 0.6933, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.002859347965568304, + "rewards/margins": 0.05717097595334053, + "rewards/rejected": -0.05431162193417549, + "step": 1260 + }, + { + "epoch": 0.08, + "learning_rate": 4.153041203400916e-06, + "logits/chosen": -2.4315924644470215, + "logits/rejected": -2.265227794647217, + "logps/chosen": -259.8785705566406, + "logps/rejected": -239.6594696044922, + "loss": 0.693, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.04543764516711235, + "rewards/margins": 0.044155552983284, + "rewards/rejected": 0.0012820929987356067, + "step": 1270 + }, + { + "epoch": 0.08, + "learning_rate": 4.185742315238718e-06, + "logits/chosen": -2.343573570251465, + "logits/rejected": -2.0737881660461426, + "logps/chosen": -194.74240112304688, + "logps/rejected": -188.17759704589844, + "loss": 0.6899, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03359198197722435, + "rewards/margins": 0.05302148312330246, + "rewards/rejected": -0.01942949742078781, + "step": 1280 + }, + { + "epoch": 0.08, + "learning_rate": 4.218443427076521e-06, + "logits/chosen": -2.289780616760254, + "logits/rejected": -2.1402783393859863, + "logps/chosen": -181.81228637695312, + "logps/rejected": -185.71270751953125, + "loss": 0.6926, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.00946279987692833, + "rewards/margins": 0.02429373934864998, + "rewards/rejected": -0.014830941334366798, + "step": 1290 + }, + { + "epoch": 0.09, + "learning_rate": 4.251144538914323e-06, + "logits/chosen": -2.2517518997192383, + "logits/rejected": -2.113873243331909, + "logps/chosen": -218.7006378173828, + "logps/rejected": -213.1343536376953, + "loss": 0.6886, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.018896425142884254, + "rewards/margins": 0.07629399746656418, + "rewards/rejected": -0.05739758163690567, + "step": 1300 + }, + { + "epoch": 0.09, + "eval_logits/chosen": -2.3501036167144775, + "eval_logits/rejected": -2.161346673965454, + "eval_logps/chosen": -230.54745483398438, + "eval_logps/rejected": -215.89462280273438, + "eval_loss": 0.6907321810722351, + "eval_rewards/accuracies": 0.6104999780654907, + "eval_rewards/chosen": 0.014575082808732986, + "eval_rewards/margins": 0.05740221589803696, + "eval_rewards/rejected": -0.04282712936401367, + "eval_runtime": 708.7595, + "eval_samples_per_second": 2.822, + "eval_steps_per_second": 1.411, + "step": 1300 + }, + { + "epoch": 0.09, + "learning_rate": 4.283845650752126e-06, + "logits/chosen": -2.379955291748047, + "logits/rejected": -2.1668858528137207, + "logps/chosen": -285.83856201171875, + "logps/rejected": -227.052490234375, + "loss": 0.691, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.003734259633347392, + "rewards/margins": 0.05405404418706894, + "rewards/rejected": -0.05031978338956833, + "step": 1310 + }, + { + "epoch": 0.09, + "learning_rate": 4.316546762589928e-06, + "logits/chosen": -2.3057024478912354, + "logits/rejected": -2.2258567810058594, + "logps/chosen": -207.4925537109375, + "logps/rejected": -178.53604125976562, + "loss": 0.6904, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.004349817987531424, + "rewards/margins": 0.05341663956642151, + "rewards/rejected": -0.04906681925058365, + "step": 1320 + }, + { + "epoch": 0.09, + "learning_rate": 4.349247874427731e-06, + "logits/chosen": -2.3624565601348877, + "logits/rejected": -2.0772993564605713, + "logps/chosen": -228.140869140625, + "logps/rejected": -275.2492980957031, + "loss": 0.6901, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.01606837287545204, + "rewards/margins": 0.07025826722383499, + "rewards/rejected": -0.08632663637399673, + "step": 1330 + }, + { + "epoch": 0.09, + "learning_rate": 4.381948986265534e-06, + "logits/chosen": -2.505383253097534, + "logits/rejected": -2.2410616874694824, + "logps/chosen": -262.94122314453125, + "logps/rejected": -259.1203918457031, + "loss": 0.6914, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0009654685854911804, + "rewards/margins": 0.09186152368783951, + "rewards/rejected": -0.0928269773721695, + "step": 1340 + }, + { + "epoch": 0.09, + "learning_rate": 4.414650098103336e-06, + "logits/chosen": -2.652198314666748, + "logits/rejected": -2.3813252449035645, + "logps/chosen": -260.88726806640625, + "logps/rejected": -242.92202758789062, + "loss": 0.6909, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.01065239030867815, + "rewards/margins": 0.04458843916654587, + "rewards/rejected": -0.03393604978919029, + "step": 1350 + }, + { + "epoch": 0.09, + "learning_rate": 4.447351209941138e-06, + "logits/chosen": -2.3565680980682373, + "logits/rejected": -2.0831761360168457, + "logps/chosen": -215.17153930664062, + "logps/rejected": -202.86697387695312, + "loss": 0.6907, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.05506874993443489, + "rewards/margins": 0.05836993455886841, + "rewards/rejected": -0.1134386882185936, + "step": 1360 + }, + { + "epoch": 0.09, + "learning_rate": 4.480052321778941e-06, + "logits/chosen": -2.305148124694824, + "logits/rejected": -2.3135273456573486, + "logps/chosen": -226.86849975585938, + "logps/rejected": -212.960205078125, + "loss": 0.6901, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.0004178326635155827, + "rewards/margins": 0.06787695735692978, + "rewards/rejected": -0.06745912879705429, + "step": 1370 + }, + { + "epoch": 0.09, + "learning_rate": 4.5127534336167435e-06, + "logits/chosen": -2.3873705863952637, + "logits/rejected": -2.1096668243408203, + "logps/chosen": -252.4434814453125, + "logps/rejected": -214.6434783935547, + "loss": 0.6916, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.052694983780384064, + "rewards/margins": 0.058600615710020065, + "rewards/rejected": -0.11129560321569443, + "step": 1380 + }, + { + "epoch": 0.09, + "learning_rate": 4.5454545454545455e-06, + "logits/chosen": -2.356358766555786, + "logits/rejected": -2.299755096435547, + "logps/chosen": -168.05401611328125, + "logps/rejected": -167.7144775390625, + "loss": 0.6907, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.014046875759959221, + "rewards/margins": 0.0986950695514679, + "rewards/rejected": -0.08464818447828293, + "step": 1390 + }, + { + "epoch": 0.09, + "learning_rate": 4.578155657292348e-06, + "logits/chosen": -2.326817274093628, + "logits/rejected": -2.2857398986816406, + "logps/chosen": -253.63827514648438, + "logps/rejected": -270.8973083496094, + "loss": 0.6887, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.017673691734671593, + "rewards/margins": 0.08337118476629257, + "rewards/rejected": -0.06569750607013702, + "step": 1400 + }, + { + "epoch": 0.09, + "eval_logits/chosen": -2.3205173015594482, + "eval_logits/rejected": -2.1350162029266357, + "eval_logps/chosen": -231.281494140625, + "eval_logps/rejected": -217.48509216308594, + "eval_loss": 0.69090336561203, + "eval_rewards/accuracies": 0.6129999756813049, + "eval_rewards/chosen": 0.007234419696033001, + "eval_rewards/margins": 0.06596639752388, + "eval_rewards/rejected": -0.058731988072395325, + "eval_runtime": 710.954, + "eval_samples_per_second": 2.813, + "eval_steps_per_second": 1.407, + "step": 1400 + }, + { + "epoch": 0.09, + "learning_rate": 4.610856769130151e-06, + "logits/chosen": -2.3867039680480957, + "logits/rejected": -2.243751049041748, + "logps/chosen": -238.0634002685547, + "logps/rejected": -215.328857421875, + "loss": 0.6921, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.010808589868247509, + "rewards/margins": 0.06767071038484573, + "rewards/rejected": -0.05686211585998535, + "step": 1410 + }, + { + "epoch": 0.09, + "learning_rate": 4.643557880967953e-06, + "logits/chosen": -2.3865418434143066, + "logits/rejected": -2.1880135536193848, + "logps/chosen": -200.36758422851562, + "logps/rejected": -201.47171020507812, + "loss": 0.6911, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.015687663108110428, + "rewards/margins": 0.03603903204202652, + "rewards/rejected": -0.05172669142484665, + "step": 1420 + }, + { + "epoch": 0.09, + "learning_rate": 4.676258992805755e-06, + "logits/chosen": -2.345919609069824, + "logits/rejected": -2.072216510772705, + "logps/chosen": -267.54669189453125, + "logps/rejected": -233.08798217773438, + "loss": 0.69, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.02073250710964203, + "rewards/margins": 0.07689642906188965, + "rewards/rejected": -0.05616391822695732, + "step": 1430 + }, + { + "epoch": 0.09, + "learning_rate": 4.708960104643558e-06, + "logits/chosen": -2.3562963008880615, + "logits/rejected": -2.3137621879577637, + "logps/chosen": -306.24517822265625, + "logps/rejected": -274.78741455078125, + "loss": 0.6907, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.00035466160625219345, + "rewards/margins": 0.05302319675683975, + "rewards/rejected": -0.05337785556912422, + "step": 1440 + }, + { + "epoch": 0.09, + "learning_rate": 4.741661216481361e-06, + "logits/chosen": -2.3189618587493896, + "logits/rejected": -2.2773196697235107, + "logps/chosen": -237.4014129638672, + "logps/rejected": -258.08966064453125, + "loss": 0.6891, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.010558743961155415, + "rewards/margins": 0.09441892802715302, + "rewards/rejected": -0.10497768223285675, + "step": 1450 + }, + { + "epoch": 0.1, + "learning_rate": 4.774362328319163e-06, + "logits/chosen": -2.298546552658081, + "logits/rejected": -2.144091844558716, + "logps/chosen": -217.25430297851562, + "logps/rejected": -191.17282104492188, + "loss": 0.6933, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.014642052352428436, + "rewards/margins": 0.038679804652929306, + "rewards/rejected": -0.05332186073064804, + "step": 1460 + }, + { + "epoch": 0.1, + "learning_rate": 4.807063440156966e-06, + "logits/chosen": -2.33768367767334, + "logits/rejected": -2.1054131984710693, + "logps/chosen": -260.9105529785156, + "logps/rejected": -208.8505401611328, + "loss": 0.6884, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.02398931235074997, + "rewards/margins": 0.09310416132211685, + "rewards/rejected": -0.06911484152078629, + "step": 1470 + }, + { + "epoch": 0.1, + "learning_rate": 4.839764551994769e-06, + "logits/chosen": -2.4007508754730225, + "logits/rejected": -2.095411539077759, + "logps/chosen": -246.15673828125, + "logps/rejected": -215.8951873779297, + "loss": 0.6919, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.025517677888274193, + "rewards/margins": 0.07472027838230133, + "rewards/rejected": -0.04920259863138199, + "step": 1480 + }, + { + "epoch": 0.1, + "learning_rate": 4.872465663832571e-06, + "logits/chosen": -2.2471413612365723, + "logits/rejected": -2.1273739337921143, + "logps/chosen": -207.810302734375, + "logps/rejected": -194.42510986328125, + "loss": 0.6898, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.0005180038278922439, + "rewards/margins": 0.07074950635433197, + "rewards/rejected": -0.0712675154209137, + "step": 1490 + }, + { + "epoch": 0.1, + "learning_rate": 4.905166775670373e-06, + "logits/chosen": -2.372222661972046, + "logits/rejected": -2.034890651702881, + "logps/chosen": -220.8203582763672, + "logps/rejected": -200.08526611328125, + "loss": 0.6887, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.00939613301306963, + "rewards/margins": 0.06658481061458588, + "rewards/rejected": -0.057188671082258224, + "step": 1500 + }, + { + "epoch": 0.1, + "eval_logits/chosen": -2.321709394454956, + "eval_logits/rejected": -2.1364989280700684, + "eval_logps/chosen": -233.14047241210938, + "eval_logps/rejected": -220.0596923828125, + "eval_loss": 0.6906663179397583, + "eval_rewards/accuracies": 0.6305000185966492, + "eval_rewards/chosen": -0.011355271562933922, + "eval_rewards/margins": 0.07312270253896713, + "eval_rewards/rejected": -0.0844779685139656, + "eval_runtime": 714.039, + "eval_samples_per_second": 2.801, + "eval_steps_per_second": 1.4, + "step": 1500 + }, + { + "epoch": 0.1, + "learning_rate": 4.9378678875081756e-06, + "logits/chosen": -2.4643445014953613, + "logits/rejected": -2.1634469032287598, + "logps/chosen": -214.7722625732422, + "logps/rejected": -191.8986358642578, + "loss": 0.6892, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.000286552298348397, + "rewards/margins": 0.08134503662586212, + "rewards/rejected": -0.081631600856781, + "step": 1510 + }, + { + "epoch": 0.1, + "learning_rate": 4.9705689993459784e-06, + "logits/chosen": -2.4508216381073, + "logits/rejected": -2.0175795555114746, + "logps/chosen": -207.11288452148438, + "logps/rejected": -161.23849487304688, + "loss": 0.6888, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.021875491365790367, + "rewards/margins": 0.11838686466217041, + "rewards/rejected": -0.14026235044002533, + "step": 1520 + }, + { + "epoch": 0.1, + "learning_rate": 4.999999934793849e-06, + "logits/chosen": -2.350825309753418, + "logits/rejected": -2.257450580596924, + "logps/chosen": -248.128173828125, + "logps/rejected": -223.34884643554688, + "loss": 0.6916, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.0017488065641373396, + "rewards/margins": 0.04637077450752258, + "rewards/rejected": -0.04811957851052284, + "step": 1530 + }, + { + "epoch": 0.1, + "learning_rate": 4.999992110059814e-06, + "logits/chosen": -2.321943759918213, + "logits/rejected": -2.3055014610290527, + "logps/chosen": -277.02490234375, + "logps/rejected": -263.4502258300781, + "loss": 0.6915, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.00662571843713522, + "rewards/margins": 0.06538694351911545, + "rewards/rejected": -0.058761220425367355, + "step": 1540 + }, + { + "epoch": 0.1, + "learning_rate": 4.999971244142299e-06, + "logits/chosen": -2.4726457595825195, + "logits/rejected": -2.1803183555603027, + "logps/chosen": -269.8371276855469, + "logps/rejected": -237.511962890625, + "loss": 0.6918, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.03615967929363251, + "rewards/margins": 0.06921084225177765, + "rewards/rejected": -0.03305116295814514, + "step": 1550 + }, + { + "epoch": 0.1, + "learning_rate": 4.999937337150149e-06, + "logits/chosen": -2.155517578125, + "logits/rejected": -2.115971803665161, + "logps/chosen": -232.05459594726562, + "logps/rejected": -219.97708129882812, + "loss": 0.6911, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.024602752178907394, + "rewards/margins": 0.03303222730755806, + "rewards/rejected": -0.008429473266005516, + "step": 1560 + }, + { + "epoch": 0.1, + "learning_rate": 4.99989038926024e-06, + "logits/chosen": -2.11495304107666, + "logits/rejected": -2.245858669281006, + "logps/chosen": -199.39358520507812, + "logps/rejected": -206.3715057373047, + "loss": 0.6895, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.009655221365392208, + "rewards/margins": 0.047758929431438446, + "rewards/rejected": -0.05741415172815323, + "step": 1570 + }, + { + "epoch": 0.1, + "learning_rate": 4.999830400717476e-06, + "logits/chosen": -2.3075220584869385, + "logits/rejected": -2.172712802886963, + "logps/chosen": -287.1651306152344, + "logps/rejected": -280.3276672363281, + "loss": 0.6904, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.0022194921039044857, + "rewards/margins": 0.0943801999092102, + "rewards/rejected": -0.09216071665287018, + "step": 1580 + }, + { + "epoch": 0.1, + "learning_rate": 4.999757371834787e-06, + "logits/chosen": -2.1875932216644287, + "logits/rejected": -2.094125747680664, + "logps/chosen": -241.1585693359375, + "logps/rejected": -238.26773071289062, + "loss": 0.6885, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.013396786525845528, + "rewards/margins": 0.126246377825737, + "rewards/rejected": -0.13964316248893738, + "step": 1590 + }, + { + "epoch": 0.1, + "learning_rate": 4.999671302993125e-06, + "logits/chosen": -2.134300470352173, + "logits/rejected": -2.071805477142334, + "logps/chosen": -248.94729614257812, + "logps/rejected": -272.7432861328125, + "loss": 0.6904, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.003668667282909155, + "rewards/margins": 0.08407886326313019, + "rewards/rejected": -0.08774752914905548, + "step": 1600 + }, + { + "epoch": 0.1, + "eval_logits/chosen": -2.327033042907715, + "eval_logits/rejected": -2.141350269317627, + "eval_logps/chosen": -227.57763671875, + "eval_logps/rejected": -214.50515747070312, + "eval_loss": 0.6906238794326782, + "eval_rewards/accuracies": 0.6259999871253967, + "eval_rewards/chosen": 0.04427312687039375, + "eval_rewards/margins": 0.07320577651262283, + "eval_rewards/rejected": -0.028932644054293633, + "eval_runtime": 706.6876, + "eval_samples_per_second": 2.83, + "eval_steps_per_second": 1.415, + "step": 1600 + }, + { + "epoch": 0.11, + "learning_rate": 4.999572194641471e-06, + "logits/chosen": -2.291485071182251, + "logits/rejected": -2.1361746788024902, + "logps/chosen": -271.530029296875, + "logps/rejected": -228.67013549804688, + "loss": 0.6888, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.03988269716501236, + "rewards/margins": 0.09610097855329514, + "rewards/rejected": -0.056218285113573074, + "step": 1610 + }, + { + "epoch": 0.11, + "learning_rate": 4.999460047296819e-06, + "logits/chosen": -2.2640249729156494, + "logits/rejected": -2.134519577026367, + "logps/chosen": -213.858154296875, + "logps/rejected": -200.5656280517578, + "loss": 0.6906, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.01746007241308689, + "rewards/margins": 0.07616613805294037, + "rewards/rejected": -0.05870606750249863, + "step": 1620 + }, + { + "epoch": 0.11, + "learning_rate": 4.999334861544186e-06, + "logits/chosen": -2.381791353225708, + "logits/rejected": -2.045926570892334, + "logps/chosen": -226.28970336914062, + "logps/rejected": -178.6829833984375, + "loss": 0.6912, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.031635530292987823, + "rewards/margins": 0.0839659571647644, + "rewards/rejected": -0.05233042314648628, + "step": 1630 + }, + { + "epoch": 0.11, + "learning_rate": 4.999196638036604e-06, + "logits/chosen": -2.4527511596679688, + "logits/rejected": -2.2551088333129883, + "logps/chosen": -281.4839782714844, + "logps/rejected": -249.938232421875, + "loss": 0.6927, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.05931050330400467, + "rewards/margins": 0.03690633550286293, + "rewards/rejected": 0.02240416780114174, + "step": 1640 + }, + { + "epoch": 0.11, + "learning_rate": 4.999045377495111e-06, + "logits/chosen": -2.117143154144287, + "logits/rejected": -2.3743062019348145, + "logps/chosen": -170.4903106689453, + "logps/rejected": -265.4920654296875, + "loss": 0.6897, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.03844357654452324, + "rewards/margins": 0.07219245284795761, + "rewards/rejected": -0.033748872578144073, + "step": 1650 + }, + { + "epoch": 0.11, + "learning_rate": 4.998881080708759e-06, + "logits/chosen": -2.2666547298431396, + "logits/rejected": -2.1885650157928467, + "logps/chosen": -239.72262573242188, + "logps/rejected": -212.6611328125, + "loss": 0.6933, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.00598920788615942, + "rewards/margins": 0.03883373737335205, + "rewards/rejected": -0.03284453600645065, + "step": 1660 + }, + { + "epoch": 0.11, + "learning_rate": 4.998703748534599e-06, + "logits/chosen": -2.0997233390808105, + "logits/rejected": -1.8399873971939087, + "logps/chosen": -240.05831909179688, + "logps/rejected": -196.15187072753906, + "loss": 0.6917, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.005714725703001022, + "rewards/margins": 0.06784109026193619, + "rewards/rejected": -0.062126368284225464, + "step": 1670 + }, + { + "epoch": 0.11, + "learning_rate": 4.998513381897683e-06, + "logits/chosen": -2.338465452194214, + "logits/rejected": -2.107517719268799, + "logps/chosen": -237.6542205810547, + "logps/rejected": -185.0458984375, + "loss": 0.6911, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.015895305201411247, + "rewards/margins": 0.04385993629693985, + "rewards/rejected": -0.027964631095528603, + "step": 1680 + }, + { + "epoch": 0.11, + "learning_rate": 4.9983099817910565e-06, + "logits/chosen": -2.259268283843994, + "logits/rejected": -2.0765738487243652, + "logps/chosen": -243.1163330078125, + "logps/rejected": -246.34951782226562, + "loss": 0.6902, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.02145574428141117, + "rewards/margins": 0.07711198180913925, + "rewards/rejected": -0.05565624311566353, + "step": 1690 + }, + { + "epoch": 0.11, + "learning_rate": 4.998093549275754e-06, + "logits/chosen": -2.2384533882141113, + "logits/rejected": -2.2348380088806152, + "logps/chosen": -258.17047119140625, + "logps/rejected": -287.0838317871094, + "loss": 0.6893, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.02163517475128174, + "rewards/margins": 0.07120385020971298, + "rewards/rejected": -0.049568675458431244, + "step": 1700 + }, + { + "epoch": 0.11, + "eval_logits/chosen": -2.3420772552490234, + "eval_logits/rejected": -2.1548047065734863, + "eval_logps/chosen": -228.6732940673828, + "eval_logps/rejected": -215.70216369628906, + "eval_loss": 0.6903870701789856, + "eval_rewards/accuracies": 0.6215000152587891, + "eval_rewards/chosen": 0.033316612243652344, + "eval_rewards/margins": 0.07421907782554626, + "eval_rewards/rejected": -0.04090247303247452, + "eval_runtime": 706.8309, + "eval_samples_per_second": 2.83, + "eval_steps_per_second": 1.415, + "step": 1700 + }, + { + "epoch": 0.11, + "learning_rate": 4.997864085480794e-06, + "logits/chosen": -2.372429370880127, + "logits/rejected": -2.201474189758301, + "logps/chosen": -263.83319091796875, + "logps/rejected": -253.3795166015625, + "loss": 0.6936, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.025129973888397217, + "rewards/margins": 0.08083225041627884, + "rewards/rejected": -0.05570227652788162, + "step": 1710 + }, + { + "epoch": 0.11, + "learning_rate": 4.997621591603171e-06, + "logits/chosen": -2.3386642932891846, + "logits/rejected": -2.1461567878723145, + "logps/chosen": -160.34231567382812, + "logps/rejected": -167.763916015625, + "loss": 0.6917, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.023095201700925827, + "rewards/margins": 0.08106372505426407, + "rewards/rejected": -0.1041589230298996, + "step": 1720 + }, + { + "epoch": 0.11, + "learning_rate": 4.997366068907853e-06, + "logits/chosen": -2.3146162033081055, + "logits/rejected": -2.260533094406128, + "logps/chosen": -257.2478942871094, + "logps/rejected": -243.188232421875, + "loss": 0.6901, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.011618571355938911, + "rewards/margins": 0.05815444514155388, + "rewards/rejected": -0.04653587192296982, + "step": 1730 + }, + { + "epoch": 0.11, + "learning_rate": 4.997097518727771e-06, + "logits/chosen": -2.366065263748169, + "logits/rejected": -2.1315550804138184, + "logps/chosen": -224.10079956054688, + "logps/rejected": -197.63809204101562, + "loss": 0.6901, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.011443842202425003, + "rewards/margins": 0.06659694015979767, + "rewards/rejected": -0.07804077863693237, + "step": 1740 + }, + { + "epoch": 0.11, + "learning_rate": 4.9968159424638155e-06, + "logits/chosen": -2.2000982761383057, + "logits/rejected": -2.418351650238037, + "logps/chosen": -218.63314819335938, + "logps/rejected": -273.26837158203125, + "loss": 0.691, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.010267667472362518, + "rewards/margins": 0.02790077030658722, + "rewards/rejected": -0.03816843777894974, + "step": 1750 + }, + { + "epoch": 0.12, + "learning_rate": 4.9965213415848235e-06, + "logits/chosen": -2.284058094024658, + "logits/rejected": -1.9068619012832642, + "logps/chosen": -231.62319946289062, + "logps/rejected": -201.48184204101562, + "loss": 0.6907, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.03218645602464676, + "rewards/margins": 0.06718595325946808, + "rewards/rejected": -0.09937240928411484, + "step": 1760 + }, + { + "epoch": 0.12, + "learning_rate": 4.9962137176275805e-06, + "logits/chosen": -2.3636927604675293, + "logits/rejected": -2.164499044418335, + "logps/chosen": -232.9027862548828, + "logps/rejected": -236.94198608398438, + "loss": 0.6934, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.013926369138062, + "rewards/margins": 0.04597216844558716, + "rewards/rejected": -0.03204580023884773, + "step": 1770 + }, + { + "epoch": 0.12, + "learning_rate": 4.9958930721968015e-06, + "logits/chosen": -2.2207086086273193, + "logits/rejected": -2.3023934364318848, + "logps/chosen": -210.31875610351562, + "logps/rejected": -226.93685913085938, + "loss": 0.692, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.017164241522550583, + "rewards/margins": 0.0511639229953289, + "rewards/rejected": -0.03399968519806862, + "step": 1780 + }, + { + "epoch": 0.12, + "learning_rate": 4.995559406965132e-06, + "logits/chosen": -2.4438416957855225, + "logits/rejected": -2.0871827602386475, + "logps/chosen": -229.1040802001953, + "logps/rejected": -204.32778930664062, + "loss": 0.6912, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.02811221405863762, + "rewards/margins": 0.04839346557855606, + "rewards/rejected": -0.02028125710785389, + "step": 1790 + }, + { + "epoch": 0.12, + "learning_rate": 4.995212723673131e-06, + "logits/chosen": -2.408142566680908, + "logits/rejected": -2.172886848449707, + "logps/chosen": -225.9892578125, + "logps/rejected": -183.36489868164062, + "loss": 0.6904, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.04213835671544075, + "rewards/margins": 0.07358547300100327, + "rewards/rejected": -0.031447120010852814, + "step": 1800 + }, + { + "epoch": 0.12, + "eval_logits/chosen": -2.3330607414245605, + "eval_logits/rejected": -2.146653652191162, + "eval_logps/chosen": -227.91104125976562, + "eval_logps/rejected": -213.03692626953125, + "eval_loss": 0.6908692717552185, + "eval_rewards/accuracies": 0.6159999966621399, + "eval_rewards/chosen": 0.040939364582300186, + "eval_rewards/margins": 0.055189553648233414, + "eval_rewards/rejected": -0.014250185340642929, + "eval_runtime": 706.2668, + "eval_samples_per_second": 2.832, + "eval_steps_per_second": 1.416, + "step": 1800 + }, + { + "epoch": 0.12, + "learning_rate": 4.99485302412927e-06, + "logits/chosen": -2.090059280395508, + "logits/rejected": -2.0199811458587646, + "logps/chosen": -204.09255981445312, + "logps/rejected": -211.0037078857422, + "loss": 0.6897, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.039184752851724625, + "rewards/margins": 0.06218431517481804, + "rewards/rejected": -0.022999566048383713, + "step": 1810 + }, + { + "epoch": 0.12, + "learning_rate": 4.994480310209918e-06, + "logits/chosen": -2.297668933868408, + "logits/rejected": -2.459190845489502, + "logps/chosen": -235.6026611328125, + "logps/rejected": -254.4353485107422, + "loss": 0.6912, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.0638035237789154, + "rewards/margins": 0.05078417807817459, + "rewards/rejected": 0.013019341044127941, + "step": 1820 + }, + { + "epoch": 0.12, + "learning_rate": 4.994094583859332e-06, + "logits/chosen": -2.326498031616211, + "logits/rejected": -2.0701937675476074, + "logps/chosen": -157.9686737060547, + "logps/rejected": -194.639892578125, + "loss": 0.6895, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.069440558552742, + "rewards/margins": 0.0464170016348362, + "rewards/rejected": 0.023023560643196106, + "step": 1830 + }, + { + "epoch": 0.12, + "learning_rate": 4.9936958470896525e-06, + "logits/chosen": -2.334303617477417, + "logits/rejected": -2.089700698852539, + "logps/chosen": -221.90780639648438, + "logps/rejected": -187.50491333007812, + "loss": 0.6886, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.018373683094978333, + "rewards/margins": 0.08500251919031143, + "rewards/rejected": -0.0666288360953331, + "step": 1840 + }, + { + "epoch": 0.12, + "learning_rate": 4.993284101980883e-06, + "logits/chosen": -2.2909703254699707, + "logits/rejected": -2.0997474193573, + "logps/chosen": -244.8101348876953, + "logps/rejected": -217.73196411132812, + "loss": 0.6859, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.023147549480199814, + "rewards/margins": 0.12554757297039032, + "rewards/rejected": -0.14869512617588043, + "step": 1850 + }, + { + "epoch": 0.12, + "learning_rate": 4.9928593506808885e-06, + "logits/chosen": -2.40543794631958, + "logits/rejected": -2.2108352184295654, + "logps/chosen": -258.2170715332031, + "logps/rejected": -228.532958984375, + "loss": 0.6916, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.014026492834091187, + "rewards/margins": 0.07293161749839783, + "rewards/rejected": -0.08695811778306961, + "step": 1860 + }, + { + "epoch": 0.12, + "learning_rate": 4.992421595405381e-06, + "logits/chosen": -2.3434250354766846, + "logits/rejected": -2.095716714859009, + "logps/chosen": -224.97021484375, + "logps/rejected": -158.5146942138672, + "loss": 0.6933, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.03222180902957916, + "rewards/margins": 0.04391375184059143, + "rewards/rejected": -0.011691942811012268, + "step": 1870 + }, + { + "epoch": 0.12, + "learning_rate": 4.991970838437905e-06, + "logits/chosen": -2.2580971717834473, + "logits/rejected": -2.1631343364715576, + "logps/chosen": -220.3092803955078, + "logps/rejected": -255.4618377685547, + "loss": 0.6898, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.0502026192843914, + "rewards/margins": 0.07047822326421738, + "rewards/rejected": -0.020275603979825974, + "step": 1880 + }, + { + "epoch": 0.12, + "learning_rate": 4.9915070821298294e-06, + "logits/chosen": -2.366753339767456, + "logits/rejected": -2.039536237716675, + "logps/chosen": -162.96090698242188, + "logps/rejected": -157.37863159179688, + "loss": 0.6913, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.04008902981877327, + "rewards/margins": 0.03506173565983772, + "rewards/rejected": 0.0050272950902581215, + "step": 1890 + }, + { + "epoch": 0.12, + "learning_rate": 4.991030328900336e-06, + "logits/chosen": -2.295825481414795, + "logits/rejected": -2.040485143661499, + "logps/chosen": -278.2053527832031, + "logps/rejected": -216.57290649414062, + "loss": 0.6908, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.06363843381404877, + "rewards/margins": 0.08403732627630234, + "rewards/rejected": -0.02039889432489872, + "step": 1900 + }, + { + "epoch": 0.12, + "eval_logits/chosen": -2.3460586071014404, + "eval_logits/rejected": -2.158717393875122, + "eval_logps/chosen": -227.45773315429688, + "eval_logps/rejected": -213.3264617919922, + "eval_loss": 0.6906041502952576, + "eval_rewards/accuracies": 0.6290000081062317, + "eval_rewards/chosen": 0.04547214135527611, + "eval_rewards/margins": 0.06261760741472244, + "eval_rewards/rejected": -0.017145469784736633, + "eval_runtime": 707.9289, + "eval_samples_per_second": 2.825, + "eval_steps_per_second": 1.413, + "step": 1900 + }, + { + "epoch": 0.12, + "learning_rate": 4.9905405812364014e-06, + "logits/chosen": -2.3040223121643066, + "logits/rejected": -2.288963794708252, + "logps/chosen": -196.22824096679688, + "logps/rejected": -204.9025115966797, + "loss": 0.6915, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.06067962199449539, + "rewards/margins": 0.062089789658784866, + "rewards/rejected": -0.0014101641718298197, + "step": 1910 + }, + { + "epoch": 0.13, + "learning_rate": 4.990037841692791e-06, + "logits/chosen": -2.2711470127105713, + "logits/rejected": -2.027571201324463, + "logps/chosen": -203.07684326171875, + "logps/rejected": -167.22711181640625, + "loss": 0.6901, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.02964043617248535, + "rewards/margins": 0.0743485689163208, + "rewards/rejected": -0.04470812529325485, + "step": 1920 + }, + { + "epoch": 0.13, + "learning_rate": 4.989522112892039e-06, + "logits/chosen": -2.3281807899475098, + "logits/rejected": -2.2966067790985107, + "logps/chosen": -198.56414794921875, + "logps/rejected": -204.41529846191406, + "loss": 0.6901, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.009347607381641865, + "rewards/margins": 0.06741191446781158, + "rewards/rejected": -0.07675951719284058, + "step": 1930 + }, + { + "epoch": 0.13, + "learning_rate": 4.98899339752444e-06, + "logits/chosen": -2.370422124862671, + "logits/rejected": -2.1254730224609375, + "logps/chosen": -227.0062255859375, + "logps/rejected": -208.43771362304688, + "loss": 0.688, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.025960898026823997, + "rewards/margins": 0.10374845564365387, + "rewards/rejected": -0.07778755575418472, + "step": 1940 + }, + { + "epoch": 0.13, + "learning_rate": 4.988451698348033e-06, + "logits/chosen": -2.287224769592285, + "logits/rejected": -2.2654972076416016, + "logps/chosen": -177.00662231445312, + "logps/rejected": -202.26292419433594, + "loss": 0.6936, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.013209563679993153, + "rewards/margins": 0.04986957833170891, + "rewards/rejected": -0.03666001558303833, + "step": 1950 + }, + { + "epoch": 0.13, + "learning_rate": 4.987897018188585e-06, + "logits/chosen": -2.263166904449463, + "logits/rejected": -2.024923801422119, + "logps/chosen": -222.4883270263672, + "logps/rejected": -171.92982482910156, + "loss": 0.6919, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.0013100657379254699, + "rewards/margins": 0.061175812035799026, + "rewards/rejected": -0.05986575409770012, + "step": 1960 + }, + { + "epoch": 0.13, + "learning_rate": 4.9873293599395814e-06, + "logits/chosen": -2.2896173000335693, + "logits/rejected": -2.1597745418548584, + "logps/chosen": -195.554931640625, + "logps/rejected": -200.31951904296875, + "loss": 0.6876, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.02123176120221615, + "rewards/margins": 0.08748042583465576, + "rewards/rejected": -0.10871219635009766, + "step": 1970 + }, + { + "epoch": 0.13, + "learning_rate": 4.986748726562203e-06, + "logits/chosen": -2.3323073387145996, + "logits/rejected": -2.2020726203918457, + "logps/chosen": -216.76565551757812, + "logps/rejected": -197.0693359375, + "loss": 0.6915, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.009111806750297546, + "rewards/margins": 0.05196034908294678, + "rewards/rejected": -0.061072152107954025, + "step": 1980 + }, + { + "epoch": 0.13, + "learning_rate": 4.98615512108532e-06, + "logits/chosen": -2.42622709274292, + "logits/rejected": -2.2754573822021484, + "logps/chosen": -218.6569061279297, + "logps/rejected": -224.9327850341797, + "loss": 0.6912, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.028538372367620468, + "rewards/margins": 0.044925060123205185, + "rewards/rejected": -0.016386687755584717, + "step": 1990 + }, + { + "epoch": 0.13, + "learning_rate": 4.985548546605469e-06, + "logits/chosen": -2.1802406311035156, + "logits/rejected": -2.314124584197998, + "logps/chosen": -215.97323608398438, + "logps/rejected": -237.9573974609375, + "loss": 0.6907, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.028674548491835594, + "rewards/margins": 0.04343647509813309, + "rewards/rejected": -0.07211102545261383, + "step": 2000 + }, + { + "epoch": 0.13, + "eval_logits/chosen": -2.355757713317871, + "eval_logits/rejected": -2.16721248626709, + "eval_logps/chosen": -232.93431091308594, + "eval_logps/rejected": -220.59490966796875, + "eval_loss": 0.6904016137123108, + "eval_rewards/accuracies": 0.6399999856948853, + "eval_rewards/chosen": -0.009293550625443459, + "eval_rewards/margins": 0.08053648471832275, + "eval_rewards/rejected": -0.08983004093170166, + "eval_runtime": 709.7269, + "eval_samples_per_second": 2.818, + "eval_steps_per_second": 1.409, + "step": 2000 + }, + { + "epoch": 0.13, + "learning_rate": 4.984929006286838e-06, + "logits/chosen": -2.2016310691833496, + "logits/rejected": -2.152900457382202, + "logps/chosen": -211.8243865966797, + "logps/rejected": -217.55712890625, + "loss": 0.6935, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.025103915482759476, + "rewards/margins": 0.022764649242162704, + "rewards/rejected": -0.04786856472492218, + "step": 2010 + }, + { + "epoch": 0.13, + "learning_rate": 4.984296503361256e-06, + "logits/chosen": -2.4327239990234375, + "logits/rejected": -2.072183132171631, + "logps/chosen": -202.29269409179688, + "logps/rejected": -166.57730102539062, + "loss": 0.6914, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.0037281368859112263, + "rewards/margins": 0.06288442015647888, + "rewards/rejected": -0.06661255657672882, + "step": 2020 + }, + { + "epoch": 0.13, + "learning_rate": 4.9836510411281645e-06, + "logits/chosen": -2.2853875160217285, + "logits/rejected": -2.1659188270568848, + "logps/chosen": -278.9981384277344, + "logps/rejected": -251.56393432617188, + "loss": 0.6867, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.004663098603487015, + "rewards/margins": 0.1196819394826889, + "rewards/rejected": -0.12434504926204681, + "step": 2030 + }, + { + "epoch": 0.13, + "learning_rate": 4.982992622954613e-06, + "logits/chosen": -2.422229766845703, + "logits/rejected": -2.1254818439483643, + "logps/chosen": -282.1938171386719, + "logps/rejected": -176.20748901367188, + "loss": 0.6885, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.013343462720513344, + "rewards/margins": 0.08182214200496674, + "rewards/rejected": -0.06847867369651794, + "step": 2040 + }, + { + "epoch": 0.13, + "learning_rate": 4.9823212522752325e-06, + "logits/chosen": -2.530290126800537, + "logits/rejected": -2.2488186359405518, + "logps/chosen": -275.6991882324219, + "logps/rejected": -252.4962615966797, + "loss": 0.6881, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.02114512026309967, + "rewards/margins": 0.10833090543746948, + "rewards/rejected": -0.08718578517436981, + "step": 2050 + }, + { + "epoch": 0.13, + "learning_rate": 4.981636932592222e-06, + "logits/chosen": -2.287421703338623, + "logits/rejected": -2.181699275970459, + "logps/chosen": -207.735107421875, + "logps/rejected": -210.69009399414062, + "loss": 0.6916, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.05177903175354004, + "rewards/margins": 0.0697595402598381, + "rewards/rejected": -0.017980504781007767, + "step": 2060 + }, + { + "epoch": 0.14, + "learning_rate": 4.980939667475328e-06, + "logits/chosen": -2.505133628845215, + "logits/rejected": -2.146200656890869, + "logps/chosen": -268.6704406738281, + "logps/rejected": -213.940673828125, + "loss": 0.6912, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.05504737049341202, + "rewards/margins": 0.0586230531334877, + "rewards/rejected": -0.0035756707657128572, + "step": 2070 + }, + { + "epoch": 0.14, + "learning_rate": 4.980229460561826e-06, + "logits/chosen": -2.353895664215088, + "logits/rejected": -2.2923059463500977, + "logps/chosen": -214.421630859375, + "logps/rejected": -210.04910278320312, + "loss": 0.6896, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.03659746050834656, + "rewards/margins": 0.10933919250965118, + "rewards/rejected": -0.07274172455072403, + "step": 2080 + }, + { + "epoch": 0.14, + "learning_rate": 4.979506315556503e-06, + "logits/chosen": -2.316563129425049, + "logits/rejected": -1.9798656702041626, + "logps/chosen": -284.0970153808594, + "logps/rejected": -241.884521484375, + "loss": 0.6907, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.023951154202222824, + "rewards/margins": 0.08775506168603897, + "rewards/rejected": -0.06380391120910645, + "step": 2090 + }, + { + "epoch": 0.14, + "learning_rate": 4.9787702362316395e-06, + "logits/chosen": -2.36979079246521, + "logits/rejected": -2.592101812362671, + "logps/chosen": -188.11888122558594, + "logps/rejected": -218.1317901611328, + "loss": 0.6904, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.027083251625299454, + "rewards/margins": 0.04498888552188873, + "rewards/rejected": -0.01790563203394413, + "step": 2100 + }, + { + "epoch": 0.14, + "eval_logits/chosen": -2.3738138675689697, + "eval_logits/rejected": -2.1836633682250977, + "eval_logps/chosen": -229.5577850341797, + "eval_logps/rejected": -215.9217529296875, + "eval_loss": 0.6904562711715698, + "eval_rewards/accuracies": 0.6380000114440918, + "eval_rewards/chosen": 0.024471644312143326, + "eval_rewards/margins": 0.06757022440433502, + "eval_rewards/rejected": -0.04309859126806259, + "eval_runtime": 710.6569, + "eval_samples_per_second": 2.814, + "eval_steps_per_second": 1.407, + "step": 2100 + }, + { + "epoch": 0.14, + "learning_rate": 4.9780212264269835e-06, + "logits/chosen": -2.3147263526916504, + "logits/rejected": -2.065084934234619, + "logps/chosen": -182.92022705078125, + "logps/rejected": -169.31704711914062, + "loss": 0.6912, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.01898839697241783, + "rewards/margins": 0.03532510995864868, + "rewards/rejected": -0.01633671671152115, + "step": 2110 + }, + { + "epoch": 0.14, + "learning_rate": 4.977259290049739e-06, + "logits/chosen": -2.568851947784424, + "logits/rejected": -1.9942678213119507, + "logps/chosen": -281.524658203125, + "logps/rejected": -226.5505828857422, + "loss": 0.6856, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.05417801812291145, + "rewards/margins": 0.14544400572776794, + "rewards/rejected": -0.0912659615278244, + "step": 2120 + }, + { + "epoch": 0.14, + "learning_rate": 4.976484431074538e-06, + "logits/chosen": -2.3134965896606445, + "logits/rejected": -2.251204252243042, + "logps/chosen": -191.16305541992188, + "logps/rejected": -178.55044555664062, + "loss": 0.692, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.026424426585435867, + "rewards/margins": 0.04989726096391678, + "rewards/rejected": -0.02347283437848091, + "step": 2130 + }, + { + "epoch": 0.14, + "learning_rate": 4.975696653543425e-06, + "logits/chosen": -2.399449586868286, + "logits/rejected": -2.1439669132232666, + "logps/chosen": -248.66311645507812, + "logps/rejected": -245.24496459960938, + "loss": 0.6904, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.009857301600277424, + "rewards/margins": 0.09151118248701096, + "rewards/rejected": -0.08165387064218521, + "step": 2140 + }, + { + "epoch": 0.14, + "learning_rate": 4.974895961565835e-06, + "logits/chosen": -2.325570583343506, + "logits/rejected": -1.9811022281646729, + "logps/chosen": -174.66339111328125, + "logps/rejected": -187.84197998046875, + "loss": 0.6885, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.005875427275896072, + "rewards/margins": 0.07950626313686371, + "rewards/rejected": -0.07363083958625793, + "step": 2150 + }, + { + "epoch": 0.14, + "learning_rate": 4.974082359318566e-06, + "logits/chosen": -2.320629596710205, + "logits/rejected": -2.144028902053833, + "logps/chosen": -252.28860473632812, + "logps/rejected": -216.6935577392578, + "loss": 0.6905, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.013614351861178875, + "rewards/margins": 0.09884041547775269, + "rewards/rejected": -0.08522607386112213, + "step": 2160 + }, + { + "epoch": 0.14, + "learning_rate": 4.973255851045769e-06, + "logits/chosen": -2.2949633598327637, + "logits/rejected": -2.3036093711853027, + "logps/chosen": -215.01193237304688, + "logps/rejected": -185.1837158203125, + "loss": 0.6907, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.023462986573576927, + "rewards/margins": 0.06919287145137787, + "rewards/rejected": -0.04572988301515579, + "step": 2170 + }, + { + "epoch": 0.14, + "learning_rate": 4.972416441058915e-06, + "logits/chosen": -2.2384517192840576, + "logits/rejected": -2.0826570987701416, + "logps/chosen": -227.3652801513672, + "logps/rejected": -211.1637420654297, + "loss": 0.6888, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.023905407637357712, + "rewards/margins": 0.10440375655889511, + "rewards/rejected": -0.0804983526468277, + "step": 2180 + }, + { + "epoch": 0.14, + "learning_rate": 4.971564133736777e-06, + "logits/chosen": -2.1866540908813477, + "logits/rejected": -2.0094194412231445, + "logps/chosen": -174.2550506591797, + "logps/rejected": -188.44947814941406, + "loss": 0.6874, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.037455908954143524, + "rewards/margins": 0.09213308990001678, + "rewards/rejected": -0.05467717722058296, + "step": 2190 + }, + { + "epoch": 0.14, + "learning_rate": 4.970698933525409e-06, + "logits/chosen": -2.478977918624878, + "logits/rejected": -2.19539737701416, + "logps/chosen": -281.2948303222656, + "logps/rejected": -250.3472900390625, + "loss": 0.6916, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.01947910524904728, + "rewards/margins": 0.08724024891853333, + "rewards/rejected": -0.10671935975551605, + "step": 2200 + }, + { + "epoch": 0.14, + "eval_logits/chosen": -2.3566019535064697, + "eval_logits/rejected": -2.166872262954712, + "eval_logps/chosen": -234.11630249023438, + "eval_logps/rejected": -221.84378051757812, + "eval_loss": 0.690406084060669, + "eval_rewards/accuracies": 0.6259999871253967, + "eval_rewards/chosen": -0.021113485097885132, + "eval_rewards/margins": 0.08120539039373398, + "eval_rewards/rejected": -0.10231887549161911, + "eval_runtime": 713.5703, + "eval_samples_per_second": 2.803, + "eval_steps_per_second": 1.401, + "step": 2200 + }, + { + "epoch": 0.14, + "learning_rate": 4.969820844938118e-06, + "logits/chosen": -2.4681389331817627, + "logits/rejected": -2.141389846801758, + "logps/chosen": -223.8988800048828, + "logps/rejected": -179.49386596679688, + "loss": 0.6895, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.012956234626471996, + "rewards/margins": 0.0949764996767044, + "rewards/rejected": -0.10793273150920868, + "step": 2210 + }, + { + "epoch": 0.15, + "learning_rate": 4.968929872555444e-06, + "logits/chosen": -1.9914073944091797, + "logits/rejected": -2.178544521331787, + "logps/chosen": -223.1538543701172, + "logps/rejected": -262.5058898925781, + "loss": 0.691, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.06019466370344162, + "rewards/margins": 0.03688964992761612, + "rewards/rejected": -0.09708431363105774, + "step": 2220 + }, + { + "epoch": 0.15, + "learning_rate": 4.968026021025137e-06, + "logits/chosen": -2.4021904468536377, + "logits/rejected": -2.184993267059326, + "logps/chosen": -201.25662231445312, + "logps/rejected": -168.76951599121094, + "loss": 0.6889, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0030830535106360912, + "rewards/margins": 0.07578281313180923, + "rewards/rejected": -0.07269976288080215, + "step": 2230 + }, + { + "epoch": 0.15, + "learning_rate": 4.967109295062128e-06, + "logits/chosen": -2.2592692375183105, + "logits/rejected": -2.035545825958252, + "logps/chosen": -223.9074249267578, + "logps/rejected": -253.28909301757812, + "loss": 0.6921, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.010940281674265862, + "rewards/margins": 0.08460705727338791, + "rewards/rejected": -0.0736667662858963, + "step": 2240 + }, + { + "epoch": 0.15, + "learning_rate": 4.966179699448509e-06, + "logits/chosen": -2.2413432598114014, + "logits/rejected": -2.0473268032073975, + "logps/chosen": -191.69692993164062, + "logps/rejected": -176.70880126953125, + "loss": 0.6916, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.009419666603207588, + "rewards/margins": 0.02837999537587166, + "rewards/rejected": -0.037799663841724396, + "step": 2250 + }, + { + "epoch": 0.15, + "learning_rate": 4.965237239033506e-06, + "logits/chosen": -2.410356044769287, + "logits/rejected": -2.2265586853027344, + "logps/chosen": -286.68280029296875, + "logps/rejected": -257.287841796875, + "loss": 0.6858, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.029759634286165237, + "rewards/margins": 0.1538696587085724, + "rewards/rejected": -0.12411002069711685, + "step": 2260 + }, + { + "epoch": 0.15, + "learning_rate": 4.964281918733453e-06, + "logits/chosen": -2.3516123294830322, + "logits/rejected": -2.0975680351257324, + "logps/chosen": -186.3314666748047, + "logps/rejected": -193.1648406982422, + "loss": 0.6882, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.03217019513249397, + "rewards/margins": 0.11290119588375092, + "rewards/rejected": -0.14507140219211578, + "step": 2270 + }, + { + "epoch": 0.15, + "learning_rate": 4.9633137435317715e-06, + "logits/chosen": -2.367588758468628, + "logits/rejected": -1.7204262018203735, + "logps/chosen": -223.7427978515625, + "logps/rejected": -165.84214782714844, + "loss": 0.6881, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.008477909490466118, + "rewards/margins": 0.09884487092494965, + "rewards/rejected": -0.10732278972864151, + "step": 2280 + }, + { + "epoch": 0.15, + "learning_rate": 4.9623327184789355e-06, + "logits/chosen": -2.464026689529419, + "logits/rejected": -2.3778140544891357, + "logps/chosen": -216.4389190673828, + "logps/rejected": -217.1889190673828, + "loss": 0.6914, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.013015327975153923, + "rewards/margins": 0.055762697011232376, + "rewards/rejected": -0.042747363448143005, + "step": 2290 + }, + { + "epoch": 0.15, + "learning_rate": 4.9613388486924525e-06, + "logits/chosen": -2.069906234741211, + "logits/rejected": -2.2107555866241455, + "logps/chosen": -184.16355895996094, + "logps/rejected": -209.44485473632812, + "loss": 0.6913, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.040295325219631195, + "rewards/margins": 0.0790117159485817, + "rewards/rejected": -0.11930704116821289, + "step": 2300 + }, + { + "epoch": 0.15, + "eval_logits/chosen": -2.3594470024108887, + "eval_logits/rejected": -2.1697957515716553, + "eval_logps/chosen": -235.0392608642578, + "eval_logps/rejected": -223.16970825195312, + "eval_loss": 0.6907156705856323, + "eval_rewards/accuracies": 0.6169999837875366, + "eval_rewards/chosen": -0.03034323826432228, + "eval_rewards/margins": 0.08523471653461456, + "eval_rewards/rejected": -0.11557795852422714, + "eval_runtime": 709.1569, + "eval_samples_per_second": 2.82, + "eval_steps_per_second": 1.41, + "step": 2300 + }, + { + "epoch": 0.15, + "learning_rate": 4.960332139356834e-06, + "logits/chosen": -2.311826705932617, + "logits/rejected": -2.130279541015625, + "logps/chosen": -211.80313110351562, + "logps/rejected": -195.69424438476562, + "loss": 0.6891, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.029862677678465843, + "rewards/margins": 0.10331012308597565, + "rewards/rejected": -0.13317279517650604, + "step": 2310 + }, + { + "epoch": 0.15, + "learning_rate": 4.95931259572357e-06, + "logits/chosen": -2.4080193042755127, + "logits/rejected": -2.043299913406372, + "logps/chosen": -235.7884521484375, + "logps/rejected": -271.4584655761719, + "loss": 0.6888, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.015484524890780449, + "rewards/margins": 0.09996424615383148, + "rewards/rejected": -0.11544877290725708, + "step": 2320 + }, + { + "epoch": 0.15, + "learning_rate": 4.9582802231111e-06, + "logits/chosen": -2.2260050773620605, + "logits/rejected": -2.310255527496338, + "logps/chosen": -211.4370574951172, + "logps/rejected": -191.3184814453125, + "loss": 0.6919, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.004370751790702343, + "rewards/margins": 0.06759954988956451, + "rewards/rejected": -0.06322880834341049, + "step": 2330 + }, + { + "epoch": 0.15, + "learning_rate": 4.957235026904782e-06, + "logits/chosen": -2.393613815307617, + "logits/rejected": -2.0691580772399902, + "logps/chosen": -256.73382568359375, + "logps/rejected": -211.1546173095703, + "loss": 0.69, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.05418550223112106, + "rewards/margins": 0.055359721183776855, + "rewards/rejected": -0.001174215809442103, + "step": 2340 + }, + { + "epoch": 0.15, + "learning_rate": 4.956177012556875e-06, + "logits/chosen": -2.478972911834717, + "logits/rejected": -2.2455832958221436, + "logps/chosen": -243.0150604248047, + "logps/rejected": -179.1586456298828, + "loss": 0.6885, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.011777469888329506, + "rewards/margins": 0.0771162286400795, + "rewards/rejected": -0.06533874571323395, + "step": 2350 + }, + { + "epoch": 0.15, + "learning_rate": 4.9551061855864976e-06, + "logits/chosen": -2.150123119354248, + "logits/rejected": -2.202948808670044, + "logps/chosen": -191.73294067382812, + "logps/rejected": -204.2236785888672, + "loss": 0.6902, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.010951901786029339, + "rewards/margins": 0.0828014463186264, + "rewards/rejected": -0.07184954732656479, + "step": 2360 + }, + { + "epoch": 0.16, + "learning_rate": 4.95402255157961e-06, + "logits/chosen": -2.189168930053711, + "logits/rejected": -2.2527401447296143, + "logps/chosen": -182.8531494140625, + "logps/rejected": -255.9742431640625, + "loss": 0.6898, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.02203099988400936, + "rewards/margins": 0.08574902266263962, + "rewards/rejected": -0.0637180283665657, + "step": 2370 + }, + { + "epoch": 0.16, + "learning_rate": 4.952926116188977e-06, + "logits/chosen": -2.4717514514923096, + "logits/rejected": -2.3997349739074707, + "logps/chosen": -182.4977264404297, + "logps/rejected": -227.0753173828125, + "loss": 0.6932, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.00879613310098648, + "rewards/margins": 0.05373033136129379, + "rewards/rejected": -0.04493419826030731, + "step": 2380 + }, + { + "epoch": 0.16, + "learning_rate": 4.951816885134143e-06, + "logits/chosen": -2.305720567703247, + "logits/rejected": -2.2849538326263428, + "logps/chosen": -200.81576538085938, + "logps/rejected": -206.62277221679688, + "loss": 0.6914, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.007371959276497364, + "rewards/margins": 0.0666775330901146, + "rewards/rejected": -0.059305571019649506, + "step": 2390 + }, + { + "epoch": 0.16, + "learning_rate": 4.950694864201399e-06, + "logits/chosen": -2.3164827823638916, + "logits/rejected": -2.2348294258117676, + "logps/chosen": -236.01382446289062, + "logps/rejected": -246.91506958007812, + "loss": 0.6899, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.026681452989578247, + "rewards/margins": 0.0877201110124588, + "rewards/rejected": -0.06103866174817085, + "step": 2400 + }, + { + "epoch": 0.16, + "eval_logits/chosen": -2.334512710571289, + "eval_logits/rejected": -2.1472020149230957, + "eval_logps/chosen": -228.88552856445312, + "eval_logps/rejected": -215.46128845214844, + "eval_loss": 0.690426766872406, + "eval_rewards/accuracies": 0.6225000023841858, + "eval_rewards/chosen": 0.031194256618618965, + "eval_rewards/margins": 0.06968805193901062, + "eval_rewards/rejected": -0.038493797183036804, + "eval_runtime": 713.4104, + "eval_samples_per_second": 2.803, + "eval_steps_per_second": 1.402, + "step": 2400 + }, + { + "epoch": 0.16, + "learning_rate": 4.9495600592437575e-06, + "logits/chosen": -2.3851048946380615, + "logits/rejected": -2.2106268405914307, + "logps/chosen": -228.5146484375, + "logps/rejected": -235.97958374023438, + "loss": 0.6918, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.016201911494135857, + "rewards/margins": 0.04283355548977852, + "rewards/rejected": -0.059035468846559525, + "step": 2410 + }, + { + "epoch": 0.16, + "learning_rate": 4.948412476180917e-06, + "logits/chosen": -2.289257526397705, + "logits/rejected": -2.005004405975342, + "logps/chosen": -185.69271850585938, + "logps/rejected": -171.775146484375, + "loss": 0.6909, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.016168467700481415, + "rewards/margins": 0.08819150179624557, + "rewards/rejected": -0.10435996949672699, + "step": 2420 + }, + { + "epoch": 0.16, + "learning_rate": 4.947252120999232e-06, + "logits/chosen": -2.3191158771514893, + "logits/rejected": -2.045474052429199, + "logps/chosen": -267.0201721191406, + "logps/rejected": -212.0325927734375, + "loss": 0.6928, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.0008467677980661392, + "rewards/margins": 0.03786264732480049, + "rewards/rejected": -0.037015873938798904, + "step": 2430 + }, + { + "epoch": 0.16, + "learning_rate": 4.946078999751683e-06, + "logits/chosen": -2.22875714302063, + "logits/rejected": -2.160421371459961, + "logps/chosen": -175.55667114257812, + "logps/rejected": -161.97189331054688, + "loss": 0.6899, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.04070456326007843, + "rewards/margins": 0.07971414923667908, + "rewards/rejected": -0.039009593427181244, + "step": 2440 + }, + { + "epoch": 0.16, + "learning_rate": 4.944893118557847e-06, + "logits/chosen": -2.142076015472412, + "logits/rejected": -2.10664439201355, + "logps/chosen": -203.5096893310547, + "logps/rejected": -157.3628692626953, + "loss": 0.6901, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.02974940836429596, + "rewards/margins": 0.07543652504682541, + "rewards/rejected": -0.04568710923194885, + "step": 2450 + }, + { + "epoch": 0.16, + "learning_rate": 4.943694483603861e-06, + "logits/chosen": -2.4463329315185547, + "logits/rejected": -2.032438278198242, + "logps/chosen": -222.4589385986328, + "logps/rejected": -179.25404357910156, + "loss": 0.6897, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.035450879484415054, + "rewards/margins": 0.077740877866745, + "rewards/rejected": -0.04229000210762024, + "step": 2460 + }, + { + "epoch": 0.16, + "learning_rate": 4.9424831011423914e-06, + "logits/chosen": -2.408552408218384, + "logits/rejected": -2.3169188499450684, + "logps/chosen": -288.58917236328125, + "logps/rejected": -253.8071746826172, + "loss": 0.6926, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.0016259975964203477, + "rewards/margins": 0.02232646383345127, + "rewards/rejected": -0.023952458053827286, + "step": 2470 + }, + { + "epoch": 0.16, + "learning_rate": 4.9412589774926015e-06, + "logits/chosen": -2.4122543334960938, + "logits/rejected": -2.113245964050293, + "logps/chosen": -273.45263671875, + "logps/rejected": -232.2276611328125, + "loss": 0.6908, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.0073710111901164055, + "rewards/margins": 0.09398536384105682, + "rewards/rejected": -0.08661436289548874, + "step": 2480 + }, + { + "epoch": 0.16, + "learning_rate": 4.940022119040121e-06, + "logits/chosen": -2.465616226196289, + "logits/rejected": -2.161163806915283, + "logps/chosen": -287.33282470703125, + "logps/rejected": -265.46343994140625, + "loss": 0.6919, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.045831646770238876, + "rewards/margins": 0.059488773345947266, + "rewards/rejected": -0.013657125644385815, + "step": 2490 + }, + { + "epoch": 0.16, + "learning_rate": 4.93877253223701e-06, + "logits/chosen": -2.417496919631958, + "logits/rejected": -2.174833297729492, + "logps/chosen": -285.5562438964844, + "logps/rejected": -259.3797302246094, + "loss": 0.6924, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.06715986877679825, + "rewards/margins": 0.053962014615535736, + "rewards/rejected": 0.013197846710681915, + "step": 2500 + }, + { + "epoch": 0.16, + "eval_logits/chosen": -2.3545589447021484, + "eval_logits/rejected": -2.165816068649292, + "eval_logps/chosen": -226.2342071533203, + "eval_logps/rejected": -212.3520965576172, + "eval_loss": 0.6904910802841187, + "eval_rewards/accuracies": 0.625, + "eval_rewards/chosen": 0.05770741403102875, + "eval_rewards/margins": 0.06510914862155914, + "eval_rewards/rejected": -0.007401737384498119, + "eval_runtime": 713.7916, + "eval_samples_per_second": 2.802, + "eval_steps_per_second": 1.401, + "step": 2500 + }, + { + "epoch": 0.16, + "learning_rate": 4.937510223601725e-06, + "logits/chosen": -2.5731029510498047, + "logits/rejected": -2.4025304317474365, + "logps/chosen": -254.93716430664062, + "logps/rejected": -217.3448944091797, + "loss": 0.6921, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.07588844746351242, + "rewards/margins": 0.04619182273745537, + "rewards/rejected": 0.029696622863411903, + "step": 2510 + }, + { + "epoch": 0.16, + "learning_rate": 4.936235199719085e-06, + "logits/chosen": -2.3957393169403076, + "logits/rejected": -2.2664778232574463, + "logps/chosen": -164.94325256347656, + "logps/rejected": -142.4560089111328, + "loss": 0.6904, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.07358762621879578, + "rewards/margins": 0.07691960036754608, + "rewards/rejected": -0.0033319753129035234, + "step": 2520 + }, + { + "epoch": 0.17, + "learning_rate": 4.93494746724024e-06, + "logits/chosen": -2.384202480316162, + "logits/rejected": -2.121302843093872, + "logps/chosen": -218.81015014648438, + "logps/rejected": -243.3566436767578, + "loss": 0.6902, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0499432310461998, + "rewards/margins": 0.07086510956287384, + "rewards/rejected": -0.020921876654028893, + "step": 2530 + }, + { + "epoch": 0.17, + "learning_rate": 4.933647032882635e-06, + "logits/chosen": -2.5266172885894775, + "logits/rejected": -2.211371898651123, + "logps/chosen": -236.7266082763672, + "logps/rejected": -201.0948486328125, + "loss": 0.6897, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.0813523381948471, + "rewards/margins": 0.08787768334150314, + "rewards/rejected": -0.0065253423526883125, + "step": 2540 + }, + { + "epoch": 0.17, + "learning_rate": 4.932333903429969e-06, + "logits/chosen": -2.205667495727539, + "logits/rejected": -2.058753728866577, + "logps/chosen": -188.5623779296875, + "logps/rejected": -162.65000915527344, + "loss": 0.6944, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": 0.045201074331998825, + "rewards/margins": -0.001063268631696701, + "rewards/rejected": 0.046264342963695526, + "step": 2550 + }, + { + "epoch": 0.17, + "learning_rate": 4.931008085732172e-06, + "logits/chosen": -2.3989763259887695, + "logits/rejected": -1.9831393957138062, + "logps/chosen": -197.83444213867188, + "logps/rejected": -153.06373596191406, + "loss": 0.692, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.04988477751612663, + "rewards/margins": 0.05230678245425224, + "rewards/rejected": -0.0024220068007707596, + "step": 2560 + }, + { + "epoch": 0.17, + "learning_rate": 4.9296695867053565e-06, + "logits/chosen": -2.357609510421753, + "logits/rejected": -2.1423609256744385, + "logps/chosen": -294.01385498046875, + "logps/rejected": -234.3355712890625, + "loss": 0.6911, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.06587690114974976, + "rewards/margins": 0.06281637400388718, + "rewards/rejected": 0.00306052272208035, + "step": 2570 + }, + { + "epoch": 0.17, + "learning_rate": 4.928318413331791e-06, + "logits/chosen": -2.4487571716308594, + "logits/rejected": -2.233754873275757, + "logps/chosen": -205.7588653564453, + "logps/rejected": -195.708251953125, + "loss": 0.6916, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.054682277143001556, + "rewards/margins": 0.05934653431177139, + "rewards/rejected": -0.004664266016334295, + "step": 2580 + }, + { + "epoch": 0.17, + "learning_rate": 4.926954572659855e-06, + "logits/chosen": -2.223047971725464, + "logits/rejected": -2.236845016479492, + "logps/chosen": -234.7522430419922, + "logps/rejected": -261.69873046875, + "loss": 0.6893, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.07443410903215408, + "rewards/margins": 0.09760646522045135, + "rewards/rejected": -0.02317235991358757, + "step": 2590 + }, + { + "epoch": 0.17, + "learning_rate": 4.925578071804013e-06, + "logits/chosen": -2.245145320892334, + "logits/rejected": -2.1898436546325684, + "logps/chosen": -227.51473999023438, + "logps/rejected": -292.41217041015625, + "loss": 0.6893, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.021910278126597404, + "rewards/margins": 0.07485561072826385, + "rewards/rejected": -0.052945323288440704, + "step": 2600 + }, + { + "epoch": 0.17, + "eval_logits/chosen": -2.3452558517456055, + "eval_logits/rejected": -2.157046318054199, + "eval_logps/chosen": -226.8026885986328, + "eval_logps/rejected": -213.66268920898438, + "eval_loss": 0.6903373003005981, + "eval_rewards/accuracies": 0.6320000290870667, + "eval_rewards/chosen": 0.05202279984951019, + "eval_rewards/margins": 0.07253072410821915, + "eval_rewards/rejected": -0.020507927983999252, + "eval_runtime": 710.3899, + "eval_samples_per_second": 2.815, + "eval_steps_per_second": 1.408, + "step": 2600 + }, + { + "epoch": 0.17, + "learning_rate": 4.924188917944763e-06, + "logits/chosen": -2.412496328353882, + "logits/rejected": -2.234360456466675, + "logps/chosen": -214.3735809326172, + "logps/rejected": -191.68753051757812, + "loss": 0.6881, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.05631525442004204, + "rewards/margins": 0.10834388434886932, + "rewards/rejected": -0.052028633654117584, + "step": 2610 + }, + { + "epoch": 0.17, + "learning_rate": 4.922787118328617e-06, + "logits/chosen": -2.432962656021118, + "logits/rejected": -2.1155600547790527, + "logps/chosen": -226.4817657470703, + "logps/rejected": -155.4060516357422, + "loss": 0.6929, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.03535914793610573, + "rewards/margins": 0.052371758967638016, + "rewards/rejected": -0.01701260730624199, + "step": 2620 + }, + { + "epoch": 0.17, + "learning_rate": 4.921372680268045e-06, + "logits/chosen": -2.411449909210205, + "logits/rejected": -2.0770812034606934, + "logps/chosen": -228.71273803710938, + "logps/rejected": -201.04159545898438, + "loss": 0.6936, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 0.012919628992676735, + "rewards/margins": 0.026976149529218674, + "rewards/rejected": -0.01405651867389679, + "step": 2630 + }, + { + "epoch": 0.17, + "learning_rate": 4.919945611141451e-06, + "logits/chosen": -2.459571361541748, + "logits/rejected": -2.078706979751587, + "logps/chosen": -218.890869140625, + "logps/rejected": -165.82656860351562, + "loss": 0.6891, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.06343764066696167, + "rewards/margins": 0.05455031245946884, + "rewards/rejected": 0.008887320756912231, + "step": 2640 + }, + { + "epoch": 0.17, + "learning_rate": 4.918505918393125e-06, + "logits/chosen": -2.321850538253784, + "logits/rejected": -2.1715524196624756, + "logps/chosen": -163.71751403808594, + "logps/rejected": -194.63999938964844, + "loss": 0.689, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.05313184857368469, + "rewards/margins": 0.07549114525318146, + "rewards/rejected": -0.022359298542141914, + "step": 2650 + }, + { + "epoch": 0.17, + "learning_rate": 4.91705360953321e-06, + "logits/chosen": -2.376260995864868, + "logits/rejected": -2.14408540725708, + "logps/chosen": -244.6078643798828, + "logps/rejected": -220.7104034423828, + "loss": 0.6892, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.008797365240752697, + "rewards/margins": 0.08757869899272919, + "rewards/rejected": -0.07878134399652481, + "step": 2660 + }, + { + "epoch": 0.17, + "learning_rate": 4.9155886921376615e-06, + "logits/chosen": -2.2897629737854004, + "logits/rejected": -2.244919776916504, + "logps/chosen": -208.5734405517578, + "logps/rejected": -236.90902709960938, + "loss": 0.6925, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.026050258427858353, + "rewards/margins": 0.06667254120111465, + "rewards/rejected": -0.0927228033542633, + "step": 2670 + }, + { + "epoch": 0.18, + "learning_rate": 4.914111173848205e-06, + "logits/chosen": -2.3279356956481934, + "logits/rejected": -2.280183792114258, + "logps/chosen": -237.3335418701172, + "logps/rejected": -226.715576171875, + "loss": 0.6918, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.04301954060792923, + "rewards/margins": 0.04799992963671684, + "rewards/rejected": -0.09101946651935577, + "step": 2680 + }, + { + "epoch": 0.18, + "learning_rate": 4.9126210623723e-06, + "logits/chosen": -2.1070661544799805, + "logits/rejected": -2.2766873836517334, + "logps/chosen": -201.4837646484375, + "logps/rejected": -244.10855102539062, + "loss": 0.6893, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.011103630065917969, + "rewards/margins": 0.08852796256542206, + "rewards/rejected": -0.09963159263134003, + "step": 2690 + }, + { + "epoch": 0.18, + "learning_rate": 4.911118365483098e-06, + "logits/chosen": -2.2129688262939453, + "logits/rejected": -2.2985918521881104, + "logps/chosen": -202.52978515625, + "logps/rejected": -220.18173217773438, + "loss": 0.6901, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.007755742873996496, + "rewards/margins": 0.08225713670253754, + "rewards/rejected": -0.09001287072896957, + "step": 2700 + }, + { + "epoch": 0.18, + "eval_logits/chosen": -2.3249430656433105, + "eval_logits/rejected": -2.138172149658203, + "eval_logps/chosen": -231.6273651123047, + "eval_logps/rejected": -217.83657836914062, + "eval_loss": 0.6905510425567627, + "eval_rewards/accuracies": 0.6324999928474426, + "eval_rewards/chosen": 0.003775849472731352, + "eval_rewards/margins": 0.06602264940738678, + "eval_rewards/rejected": -0.06224680691957474, + "eval_runtime": 712.2335, + "eval_samples_per_second": 2.808, + "eval_steps_per_second": 1.404, + "step": 2700 + }, + { + "epoch": 0.18, + "learning_rate": 4.909603091019403e-06, + "logits/chosen": -2.4967615604400635, + "logits/rejected": -2.1264119148254395, + "logps/chosen": -232.87014770507812, + "logps/rejected": -201.87669372558594, + "loss": 0.6922, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.03507193177938461, + "rewards/margins": 0.05626847222447395, + "rewards/rejected": -0.021196534857153893, + "step": 2710 + }, + { + "epoch": 0.18, + "learning_rate": 4.908075246885626e-06, + "logits/chosen": -2.2765352725982666, + "logits/rejected": -2.204392910003662, + "logps/chosen": -149.35107421875, + "logps/rejected": -125.10282135009766, + "loss": 0.6928, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.01376400887966156, + "rewards/margins": 0.03143042325973511, + "rewards/rejected": -0.017666416242718697, + "step": 2720 + }, + { + "epoch": 0.18, + "learning_rate": 4.906534841051755e-06, + "logits/chosen": -2.138417959213257, + "logits/rejected": -2.202500820159912, + "logps/chosen": -241.3466339111328, + "logps/rejected": -257.94903564453125, + "loss": 0.6898, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.02056839130818844, + "rewards/margins": 0.055667709559202194, + "rewards/rejected": -0.03509931638836861, + "step": 2730 + }, + { + "epoch": 0.18, + "learning_rate": 4.904981881553297e-06, + "logits/chosen": -2.4024128913879395, + "logits/rejected": -2.0706582069396973, + "logps/chosen": -223.4213409423828, + "logps/rejected": -164.6261444091797, + "loss": 0.691, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.007749582640826702, + "rewards/margins": 0.05322981998324394, + "rewards/rejected": -0.04548024386167526, + "step": 2740 + }, + { + "epoch": 0.18, + "learning_rate": 4.903416376491252e-06, + "logits/chosen": -2.4039175510406494, + "logits/rejected": -2.008875608444214, + "logps/chosen": -277.7300720214844, + "logps/rejected": -245.2926788330078, + "loss": 0.6898, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.005621733609586954, + "rewards/margins": 0.08154728263616562, + "rewards/rejected": -0.0759255513548851, + "step": 2750 + }, + { + "epoch": 0.18, + "learning_rate": 4.90183833403206e-06, + "logits/chosen": -2.467904567718506, + "logits/rejected": -2.3105967044830322, + "logps/chosen": -261.31292724609375, + "logps/rejected": -234.25772094726562, + "loss": 0.6895, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.029001805931329727, + "rewards/margins": 0.09197796136140823, + "rewards/rejected": -0.0629761591553688, + "step": 2760 + }, + { + "epoch": 0.18, + "learning_rate": 4.900247762407564e-06, + "logits/chosen": -2.228250026702881, + "logits/rejected": -2.0147242546081543, + "logps/chosen": -173.1147003173828, + "logps/rejected": -203.9661407470703, + "loss": 0.6878, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.008227216079831123, + "rewards/margins": 0.08808780461549759, + "rewards/rejected": -0.09631501138210297, + "step": 2770 + }, + { + "epoch": 0.18, + "learning_rate": 4.898644669914965e-06, + "logits/chosen": -2.2914719581604004, + "logits/rejected": -2.2036476135253906, + "logps/chosen": -222.91055297851562, + "logps/rejected": -216.57778930664062, + "loss": 0.6908, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.01161886565387249, + "rewards/margins": 0.07493311911821365, + "rewards/rejected": -0.06331426650285721, + "step": 2780 + }, + { + "epoch": 0.18, + "learning_rate": 4.897029064916778e-06, + "logits/chosen": -2.0988786220550537, + "logits/rejected": -1.9013208150863647, + "logps/chosen": -208.904541015625, + "logps/rejected": -201.63589477539062, + "loss": 0.6919, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.007546191103756428, + "rewards/margins": 0.051201723515987396, + "rewards/rejected": -0.058747924864292145, + "step": 2790 + }, + { + "epoch": 0.18, + "learning_rate": 4.895400955840791e-06, + "logits/chosen": -2.459437847137451, + "logits/rejected": -1.807782769203186, + "logps/chosen": -215.23831176757812, + "logps/rejected": -182.17108154296875, + "loss": 0.6909, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.05112996697425842, + "rewards/margins": 0.08630537241697311, + "rewards/rejected": -0.035175397992134094, + "step": 2800 + }, + { + "epoch": 0.18, + "eval_logits/chosen": -2.3020472526550293, + "eval_logits/rejected": -2.1165366172790527, + "eval_logps/chosen": -228.67945861816406, + "eval_logps/rejected": -215.24508666992188, + "eval_loss": 0.6903337240219116, + "eval_rewards/accuracies": 0.6315000057220459, + "eval_rewards/chosen": 0.03325507417321205, + "eval_rewards/margins": 0.0695870891213417, + "eval_rewards/rejected": -0.03633202239871025, + "eval_runtime": 710.6994, + "eval_samples_per_second": 2.814, + "eval_steps_per_second": 1.407, + "step": 2800 + }, + { + "epoch": 0.18, + "learning_rate": 4.893760351180018e-06, + "logits/chosen": -2.3032994270324707, + "logits/rejected": -2.2417054176330566, + "logps/chosen": -195.91603088378906, + "logps/rejected": -206.2035675048828, + "loss": 0.6903, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.026252543553709984, + "rewards/margins": 0.05186532065272331, + "rewards/rejected": -0.02561277151107788, + "step": 2810 + }, + { + "epoch": 0.18, + "learning_rate": 4.892107259492657e-06, + "logits/chosen": -2.2603695392608643, + "logits/rejected": -2.022012233734131, + "logps/chosen": -237.22506713867188, + "logps/rejected": -246.58621215820312, + "loss": 0.6919, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.026342108845710754, + "rewards/margins": 0.03342199698090553, + "rewards/rejected": -0.0070798941887915134, + "step": 2820 + }, + { + "epoch": 0.19, + "learning_rate": 4.890441689402042e-06, + "logits/chosen": -2.3950798511505127, + "logits/rejected": -2.2143449783325195, + "logps/chosen": -331.57049560546875, + "logps/rejected": -291.9052734375, + "loss": 0.6898, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.03934413939714432, + "rewards/margins": 0.08334746211767197, + "rewards/rejected": -0.04400331899523735, + "step": 2830 + }, + { + "epoch": 0.19, + "learning_rate": 4.888763649596606e-06, + "logits/chosen": -2.402956485748291, + "logits/rejected": -2.176975965499878, + "logps/chosen": -208.06912231445312, + "logps/rejected": -206.9716339111328, + "loss": 0.69, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.020925721153616905, + "rewards/margins": 0.07703931629657745, + "rewards/rejected": -0.056113600730895996, + "step": 2840 + }, + { + "epoch": 0.19, + "learning_rate": 4.887073148829824e-06, + "logits/chosen": -2.3179640769958496, + "logits/rejected": -2.170621871948242, + "logps/chosen": -262.16778564453125, + "logps/rejected": -246.90365600585938, + "loss": 0.69, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.04735777527093887, + "rewards/margins": 0.07671411335468292, + "rewards/rejected": -0.029356345534324646, + "step": 2850 + }, + { + "epoch": 0.19, + "learning_rate": 4.885370195920177e-06, + "logits/chosen": -2.199707508087158, + "logits/rejected": -2.139965057373047, + "logps/chosen": -185.32675170898438, + "logps/rejected": -182.80758666992188, + "loss": 0.6918, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -3.044344521185849e-05, + "rewards/margins": 0.060539864003658295, + "rewards/rejected": -0.060570307075977325, + "step": 2860 + }, + { + "epoch": 0.19, + "learning_rate": 4.883654799751101e-06, + "logits/chosen": -2.105257749557495, + "logits/rejected": -2.3305790424346924, + "logps/chosen": -212.72607421875, + "logps/rejected": -253.13320922851562, + "loss": 0.6914, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.029830992221832275, + "rewards/margins": 0.056669920682907104, + "rewards/rejected": -0.026838932186365128, + "step": 2870 + }, + { + "epoch": 0.19, + "learning_rate": 4.8819269692709435e-06, + "logits/chosen": -2.4003872871398926, + "logits/rejected": -2.227214813232422, + "logps/chosen": -263.90191650390625, + "logps/rejected": -199.56661987304688, + "loss": 0.6899, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.03144312649965286, + "rewards/margins": 0.08026852458715439, + "rewards/rejected": -0.048825401812791824, + "step": 2880 + }, + { + "epoch": 0.19, + "learning_rate": 4.880186713492915e-06, + "logits/chosen": -2.2545135021209717, + "logits/rejected": -2.066854238510132, + "logps/chosen": -227.1837921142578, + "logps/rejected": -178.45132446289062, + "loss": 0.6908, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.005335694644600153, + "rewards/margins": 0.04889502376317978, + "rewards/rejected": -0.043559327721595764, + "step": 2890 + }, + { + "epoch": 0.19, + "learning_rate": 4.878434041495041e-06, + "logits/chosen": -2.2972848415374756, + "logits/rejected": -2.416718006134033, + "logps/chosen": -233.84445190429688, + "logps/rejected": -244.32199096679688, + "loss": 0.6893, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.033212922513484955, + "rewards/margins": 0.08615333586931229, + "rewards/rejected": -0.05294041708111763, + "step": 2900 + }, + { + "epoch": 0.19, + "eval_logits/chosen": -2.296010971069336, + "eval_logits/rejected": -2.1109728813171387, + "eval_logps/chosen": -230.9010009765625, + "eval_logps/rejected": -217.73272705078125, + "eval_loss": 0.6902217268943787, + "eval_rewards/accuracies": 0.6380000114440918, + "eval_rewards/chosen": 0.011039442382752895, + "eval_rewards/margins": 0.07224779576063156, + "eval_rewards/rejected": -0.06120835244655609, + "eval_runtime": 712.9014, + "eval_samples_per_second": 2.805, + "eval_steps_per_second": 1.403, + "step": 2900 + }, + { + "epoch": 0.19, + "learning_rate": 4.876668962420117e-06, + "logits/chosen": -2.2779107093811035, + "logits/rejected": -2.0205576419830322, + "logps/chosen": -285.90301513671875, + "logps/rejected": -234.1432342529297, + "loss": 0.6918, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.0374809131026268, + "rewards/margins": 0.07856379449367523, + "rewards/rejected": -0.04108288139104843, + "step": 2910 + }, + { + "epoch": 0.19, + "learning_rate": 4.87489148547566e-06, + "logits/chosen": -2.2894115447998047, + "logits/rejected": -2.142322540283203, + "logps/chosen": -261.393310546875, + "logps/rejected": -233.22802734375, + "loss": 0.6929, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.015348220244050026, + "rewards/margins": 0.050076454877853394, + "rewards/rejected": -0.06542467325925827, + "step": 2920 + }, + { + "epoch": 0.19, + "learning_rate": 4.873101619933862e-06, + "logits/chosen": -2.5051543712615967, + "logits/rejected": -2.1486945152282715, + "logps/chosen": -263.4872131347656, + "logps/rejected": -221.30996704101562, + "loss": 0.6895, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.010399745777249336, + "rewards/margins": 0.07138345390558243, + "rewards/rejected": -0.06098370626568794, + "step": 2930 + }, + { + "epoch": 0.19, + "learning_rate": 4.8712993751315385e-06, + "logits/chosen": -2.283648729324341, + "logits/rejected": -2.2247979640960693, + "logps/chosen": -120.84417724609375, + "logps/rejected": -128.2853240966797, + "loss": 0.6913, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.001862399629317224, + "rewards/margins": 0.03539283573627472, + "rewards/rejected": -0.03725523501634598, + "step": 2940 + }, + { + "epoch": 0.19, + "learning_rate": 4.869484760470079e-06, + "logits/chosen": -2.3379874229431152, + "logits/rejected": -2.1309611797332764, + "logps/chosen": -187.4462127685547, + "logps/rejected": -158.68853759765625, + "loss": 0.6884, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.012146204710006714, + "rewards/margins": 0.07541505247354507, + "rewards/rejected": -0.06326885521411896, + "step": 2950 + }, + { + "epoch": 0.19, + "learning_rate": 4.867657785415404e-06, + "logits/chosen": -2.2649407386779785, + "logits/rejected": -1.9867734909057617, + "logps/chosen": -250.11520385742188, + "logps/rejected": -217.9238739013672, + "loss": 0.6893, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.010161913931369781, + "rewards/margins": 0.09553287923336029, + "rewards/rejected": -0.10569479316473007, + "step": 2960 + }, + { + "epoch": 0.19, + "learning_rate": 4.865818459497911e-06, + "logits/chosen": -2.49599027633667, + "logits/rejected": -2.0337436199188232, + "logps/chosen": -284.77001953125, + "logps/rejected": -206.2366180419922, + "loss": 0.6891, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.00591338612139225, + "rewards/margins": 0.0626380667090416, + "rewards/rejected": -0.0685514584183693, + "step": 2970 + }, + { + "epoch": 0.19, + "learning_rate": 4.863966792312423e-06, + "logits/chosen": -2.382023811340332, + "logits/rejected": -2.142746686935425, + "logps/chosen": -239.0797576904297, + "logps/rejected": -208.9804229736328, + "loss": 0.6889, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.023791249841451645, + "rewards/margins": 0.10952029377222061, + "rewards/rejected": -0.08572904765605927, + "step": 2980 + }, + { + "epoch": 0.2, + "learning_rate": 4.862102793518145e-06, + "logits/chosen": -2.2269492149353027, + "logits/rejected": -2.290496349334717, + "logps/chosen": -194.52423095703125, + "logps/rejected": -209.1487274169922, + "loss": 0.6884, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0024234852753579617, + "rewards/margins": 0.07699505239725113, + "rewards/rejected": -0.07941852509975433, + "step": 2990 + }, + { + "epoch": 0.2, + "learning_rate": 4.8602264728386075e-06, + "logits/chosen": -2.3325839042663574, + "logits/rejected": -2.184682607650757, + "logps/chosen": -252.10238647460938, + "logps/rejected": -256.3367919921875, + "loss": 0.6925, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.00397127028554678, + "rewards/margins": 0.07653030008077621, + "rewards/rejected": -0.07255902886390686, + "step": 3000 + }, + { + "epoch": 0.2, + "eval_logits/chosen": -2.3181509971618652, + "eval_logits/rejected": -2.131211280822754, + "eval_logps/chosen": -230.46095275878906, + "eval_logps/rejected": -218.1745147705078, + "eval_loss": 0.6903403401374817, + "eval_rewards/accuracies": 0.6244999766349792, + "eval_rewards/chosen": 0.015439935959875584, + "eval_rewards/margins": 0.08106595277786255, + "eval_rewards/rejected": -0.06562602519989014, + "eval_runtime": 709.3669, + "eval_samples_per_second": 2.819, + "eval_steps_per_second": 1.41, + "step": 3000 + }, + { + "epoch": 0.2, + "learning_rate": 4.858337840061616e-06, + "logits/chosen": -2.309683084487915, + "logits/rejected": -2.230560779571533, + "logps/chosen": -180.07546997070312, + "logps/rejected": -241.62252807617188, + "loss": 0.6908, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.030049040913581848, + "rewards/margins": 0.07897917181253433, + "rewards/rejected": -0.048930130898952484, + "step": 3010 + }, + { + "epoch": 0.2, + "learning_rate": 4.856436905039208e-06, + "logits/chosen": -2.3335537910461426, + "logits/rejected": -2.174056053161621, + "logps/chosen": -207.0081024169922, + "logps/rejected": -181.50457763671875, + "loss": 0.6889, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.04196963459253311, + "rewards/margins": 0.08883820474147797, + "rewards/rejected": -0.046868570148944855, + "step": 3020 + }, + { + "epoch": 0.2, + "learning_rate": 4.854523677687588e-06, + "logits/chosen": -2.2027428150177, + "logits/rejected": -2.271785259246826, + "logps/chosen": -176.38583374023438, + "logps/rejected": -201.62588500976562, + "loss": 0.689, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.048596903681755066, + "rewards/margins": 0.06843477487564087, + "rewards/rejected": -0.0198378786444664, + "step": 3030 + }, + { + "epoch": 0.2, + "learning_rate": 4.85259816798709e-06, + "logits/chosen": -2.4263253211975098, + "logits/rejected": -1.8797962665557861, + "logps/chosen": -281.78717041015625, + "logps/rejected": -212.32394409179688, + "loss": 0.6909, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.059749238193035126, + "rewards/margins": 0.11092700809240341, + "rewards/rejected": -0.05117777734994888, + "step": 3040 + }, + { + "epoch": 0.2, + "learning_rate": 4.850660385982114e-06, + "logits/chosen": -2.4107866287231445, + "logits/rejected": -2.2419321537017822, + "logps/chosen": -243.84481811523438, + "logps/rejected": -195.49806213378906, + "loss": 0.6884, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.0387430340051651, + "rewards/margins": 0.06934549659490585, + "rewards/rejected": -0.030602458864450455, + "step": 3050 + }, + { + "epoch": 0.2, + "learning_rate": 4.848710341781081e-06, + "logits/chosen": -2.152615547180176, + "logits/rejected": -2.259021043777466, + "logps/chosen": -176.58517456054688, + "logps/rejected": -175.9096221923828, + "loss": 0.6923, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.026836439967155457, + "rewards/margins": 0.050511687994003296, + "rewards/rejected": -0.07734812796115875, + "step": 3060 + }, + { + "epoch": 0.2, + "learning_rate": 4.846748045556377e-06, + "logits/chosen": -2.3441312313079834, + "logits/rejected": -2.0264244079589844, + "logps/chosen": -239.79904174804688, + "logps/rejected": -186.3507537841797, + "loss": 0.6912, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.003119309199973941, + "rewards/margins": 0.07062678039073944, + "rewards/rejected": -0.07374609261751175, + "step": 3070 + }, + { + "epoch": 0.2, + "learning_rate": 4.8447735075442995e-06, + "logits/chosen": -2.2217564582824707, + "logits/rejected": -2.284585952758789, + "logps/chosen": -201.0135498046875, + "logps/rejected": -206.20608520507812, + "loss": 0.6903, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.03451332077383995, + "rewards/margins": 0.08593029528856277, + "rewards/rejected": -0.12044362723827362, + "step": 3080 + }, + { + "epoch": 0.2, + "learning_rate": 4.8427867380450075e-06, + "logits/chosen": -2.406268835067749, + "logits/rejected": -2.0118308067321777, + "logps/chosen": -234.0061798095703, + "logps/rejected": -197.52023315429688, + "loss": 0.6894, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.036669012159109116, + "rewards/margins": 0.09763816744089127, + "rewards/rejected": -0.13430717587471008, + "step": 3090 + }, + { + "epoch": 0.2, + "learning_rate": 4.840787747422462e-06, + "logits/chosen": -2.3698010444641113, + "logits/rejected": -2.114318609237671, + "logps/chosen": -199.02552795410156, + "logps/rejected": -175.43988037109375, + "loss": 0.692, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.033694010227918625, + "rewards/margins": 0.06671958416700363, + "rewards/rejected": -0.10041359812021255, + "step": 3100 + }, + { + "epoch": 0.2, + "eval_logits/chosen": -2.3160300254821777, + "eval_logits/rejected": -2.129782199859619, + "eval_logps/chosen": -235.4629669189453, + "eval_logps/rejected": -223.5566864013672, + "eval_loss": 0.6903056502342224, + "eval_rewards/accuracies": 0.6439999938011169, + "eval_rewards/chosen": -0.034580256789922714, + "eval_rewards/margins": 0.08486771583557129, + "eval_rewards/rejected": -0.1194479689002037, + "eval_runtime": 712.862, + "eval_samples_per_second": 2.806, + "eval_steps_per_second": 1.403, + "step": 3100 + }, + { + "epoch": 0.2, + "learning_rate": 4.838776546104378e-06, + "logits/chosen": -2.2874035835266113, + "logits/rejected": -2.2883827686309814, + "logps/chosen": -282.4599609375, + "logps/rejected": -252.1973876953125, + "loss": 0.6882, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.015067142434418201, + "rewards/margins": 0.10139371454715729, + "rewards/rejected": -0.11646085977554321, + "step": 3110 + }, + { + "epoch": 0.2, + "learning_rate": 4.836753144582168e-06, + "logits/chosen": -2.2503442764282227, + "logits/rejected": -2.0378506183624268, + "logps/chosen": -245.8728790283203, + "logps/rejected": -234.16201782226562, + "loss": 0.6882, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.03641275316476822, + "rewards/margins": 0.12401758134365082, + "rewards/rejected": -0.16043034195899963, + "step": 3120 + }, + { + "epoch": 0.2, + "learning_rate": 4.834717553410884e-06, + "logits/chosen": -2.3153603076934814, + "logits/rejected": -2.077373743057251, + "logps/chosen": -190.7818145751953, + "logps/rejected": -213.3385009765625, + "loss": 0.6909, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.015058299526572227, + "rewards/margins": 0.08887463808059692, + "rewards/rejected": -0.1039329394698143, + "step": 3130 + }, + { + "epoch": 0.21, + "learning_rate": 4.832669783209167e-06, + "logits/chosen": -2.192064046859741, + "logits/rejected": -2.241379499435425, + "logps/chosen": -245.5317840576172, + "logps/rejected": -248.6998291015625, + "loss": 0.6933, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.011057475581765175, + "rewards/margins": 0.026191571727395058, + "rewards/rejected": -0.03724905103445053, + "step": 3140 + }, + { + "epoch": 0.21, + "learning_rate": 4.8306098446591895e-06, + "logits/chosen": -1.8934345245361328, + "logits/rejected": -2.0603950023651123, + "logps/chosen": -177.72357177734375, + "logps/rejected": -208.8456573486328, + "loss": 0.6898, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.014547166414558887, + "rewards/margins": 0.06190754845738411, + "rewards/rejected": -0.07645471394062042, + "step": 3150 + }, + { + "epoch": 0.21, + "learning_rate": 4.828537748506601e-06, + "logits/chosen": -2.411770820617676, + "logits/rejected": -2.1453702449798584, + "logps/chosen": -273.4091796875, + "logps/rejected": -221.22647094726562, + "loss": 0.6925, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.00010936595208477229, + "rewards/margins": 0.04063498228788376, + "rewards/rejected": -0.040744349360466, + "step": 3160 + }, + { + "epoch": 0.21, + "learning_rate": 4.826453505560469e-06, + "logits/chosen": -2.0967440605163574, + "logits/rejected": -2.0667638778686523, + "logps/chosen": -192.63839721679688, + "logps/rejected": -184.79087829589844, + "loss": 0.6903, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.016592925414443016, + "rewards/margins": 0.05849025771021843, + "rewards/rejected": -0.07508319616317749, + "step": 3170 + }, + { + "epoch": 0.21, + "learning_rate": 4.824357126693226e-06, + "logits/chosen": -2.206259250640869, + "logits/rejected": -1.8263355493545532, + "logps/chosen": -260.7179260253906, + "logps/rejected": -217.93618774414062, + "loss": 0.6922, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.01974749192595482, + "rewards/margins": 0.05945660546422005, + "rewards/rejected": -0.07920410484075546, + "step": 3180 + }, + { + "epoch": 0.21, + "learning_rate": 4.8222486228406105e-06, + "logits/chosen": -2.3845696449279785, + "logits/rejected": -2.098829746246338, + "logps/chosen": -211.3022003173828, + "logps/rejected": -183.42645263671875, + "loss": 0.6907, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0006182378274388611, + "rewards/margins": 0.06805343925952911, + "rewards/rejected": -0.06867166608572006, + "step": 3190 + }, + { + "epoch": 0.21, + "learning_rate": 4.820128005001612e-06, + "logits/chosen": -2.0764338970184326, + "logits/rejected": -1.9985427856445312, + "logps/chosen": -217.34097290039062, + "logps/rejected": -208.50668334960938, + "loss": 0.687, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.016526032239198685, + "rewards/margins": 0.12708571553230286, + "rewards/rejected": -0.11055967956781387, + "step": 3200 + }, + { + "epoch": 0.21, + "eval_logits/chosen": -2.3211934566497803, + "eval_logits/rejected": -2.1343932151794434, + "eval_logps/chosen": -233.4681854248047, + "eval_logps/rejected": -220.65005493164062, + "eval_loss": 0.6902753114700317, + "eval_rewards/accuracies": 0.6209999918937683, + "eval_rewards/chosen": -0.014632347039878368, + "eval_rewards/margins": 0.07574935257434845, + "eval_rewards/rejected": -0.0903816968202591, + "eval_runtime": 710.7834, + "eval_samples_per_second": 2.814, + "eval_steps_per_second": 1.407, + "step": 3200 + }, + { + "epoch": 0.21, + "learning_rate": 4.817995284238412e-06, + "logits/chosen": -2.1152288913726807, + "logits/rejected": -2.1994452476501465, + "logps/chosen": -198.7126007080078, + "logps/rejected": -236.6934356689453, + "loss": 0.6888, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.021882567554712296, + "rewards/margins": 0.10495994985103607, + "rewards/rejected": -0.12684252858161926, + "step": 3210 + }, + { + "epoch": 0.21, + "learning_rate": 4.815850471676327e-06, + "logits/chosen": -2.2534170150756836, + "logits/rejected": -2.134138822555542, + "logps/chosen": -238.7698211669922, + "logps/rejected": -244.8793487548828, + "loss": 0.6888, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.003558219876140356, + "rewards/margins": 0.10356787592172623, + "rewards/rejected": -0.10712607949972153, + "step": 3220 + }, + { + "epoch": 0.21, + "learning_rate": 4.813693578503751e-06, + "logits/chosen": -2.303338050842285, + "logits/rejected": -2.123116970062256, + "logps/chosen": -295.8538513183594, + "logps/rejected": -247.59762573242188, + "loss": 0.6913, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.0065292296931147575, + "rewards/margins": 0.072014220058918, + "rewards/rejected": -0.06548498570919037, + "step": 3230 + }, + { + "epoch": 0.21, + "learning_rate": 4.811524615972093e-06, + "logits/chosen": -2.3142409324645996, + "logits/rejected": -2.1741249561309814, + "logps/chosen": -230.9072265625, + "logps/rejected": -246.0470733642578, + "loss": 0.6899, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.009716503322124481, + "rewards/margins": 0.06937507539987564, + "rewards/rejected": -0.07909159362316132, + "step": 3240 + }, + { + "epoch": 0.21, + "learning_rate": 4.809343595395724e-06, + "logits/chosen": -2.51108455657959, + "logits/rejected": -2.3194468021392822, + "logps/chosen": -191.16067504882812, + "logps/rejected": -164.18856811523438, + "loss": 0.6909, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.021050242707133293, + "rewards/margins": 0.03616604954004288, + "rewards/rejected": -0.05721629410982132, + "step": 3250 + }, + { + "epoch": 0.21, + "learning_rate": 4.807150528151918e-06, + "logits/chosen": -2.336385726928711, + "logits/rejected": -2.1668715476989746, + "logps/chosen": -166.63986206054688, + "logps/rejected": -193.17758178710938, + "loss": 0.6889, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.0005269769462756813, + "rewards/margins": 0.0867081880569458, + "rewards/rejected": -0.08618120849132538, + "step": 3260 + }, + { + "epoch": 0.21, + "learning_rate": 4.804945425680787e-06, + "logits/chosen": -2.288424253463745, + "logits/rejected": -2.3639869689941406, + "logps/chosen": -190.94607543945312, + "logps/rejected": -174.963134765625, + "loss": 0.6925, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.025028562173247337, + "rewards/margins": 0.04095301777124405, + "rewards/rejected": -0.06598157435655594, + "step": 3270 + }, + { + "epoch": 0.21, + "learning_rate": 4.802728299485225e-06, + "logits/chosen": -2.146742343902588, + "logits/rejected": -2.1346192359924316, + "logps/chosen": -153.60440063476562, + "logps/rejected": -180.19187927246094, + "loss": 0.6901, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.02999986708164215, + "rewards/margins": 0.050558023154735565, + "rewards/rejected": -0.08055789768695831, + "step": 3280 + }, + { + "epoch": 0.22, + "learning_rate": 4.8004991611308495e-06, + "logits/chosen": -2.4337520599365234, + "logits/rejected": -2.1402642726898193, + "logps/chosen": -240.08847045898438, + "logps/rejected": -231.9585418701172, + "loss": 0.6892, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.014652663841843605, + "rewards/margins": 0.07920269668102264, + "rewards/rejected": -0.06455003470182419, + "step": 3290 + }, + { + "epoch": 0.22, + "learning_rate": 4.798258022245937e-06, + "logits/chosen": -2.3729190826416016, + "logits/rejected": -1.9661096334457397, + "logps/chosen": -218.466064453125, + "logps/rejected": -187.5246124267578, + "loss": 0.6908, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.00859595276415348, + "rewards/margins": 0.07523629814386368, + "rewards/rejected": -0.08383224904537201, + "step": 3300 + }, + { + "epoch": 0.22, + "eval_logits/chosen": -2.309415817260742, + "eval_logits/rejected": -2.1233413219451904, + "eval_logps/chosen": -232.6118621826172, + "eval_logps/rejected": -220.64340209960938, + "eval_loss": 0.6902174353599548, + "eval_rewards/accuracies": 0.6420000195503235, + "eval_rewards/chosen": -0.006068930495530367, + "eval_rewards/margins": 0.08424630761146545, + "eval_rewards/rejected": -0.09031523764133453, + "eval_runtime": 712.8632, + "eval_samples_per_second": 2.806, + "eval_steps_per_second": 1.403, + "step": 3300 + }, + { + "epoch": 0.22, + "learning_rate": 4.796004894521365e-06, + "logits/chosen": -2.3003628253936768, + "logits/rejected": -2.1167216300964355, + "logps/chosen": -230.6715850830078, + "logps/rejected": -260.5556640625, + "loss": 0.6903, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.021018046885728836, + "rewards/margins": 0.08627601712942123, + "rewards/rejected": -0.10729406774044037, + "step": 3310 + }, + { + "epoch": 0.22, + "learning_rate": 4.7937397897105545e-06, + "logits/chosen": -2.290663242340088, + "logits/rejected": -2.2099320888519287, + "logps/chosen": -203.26271057128906, + "logps/rejected": -182.44137573242188, + "loss": 0.6928, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.019701208919286728, + "rewards/margins": 0.043921031057834625, + "rewards/rejected": -0.024219822138547897, + "step": 3320 + }, + { + "epoch": 0.22, + "learning_rate": 4.791462719629399e-06, + "logits/chosen": -2.3039164543151855, + "logits/rejected": -2.1613926887512207, + "logps/chosen": -183.3705596923828, + "logps/rejected": -171.16586303710938, + "loss": 0.6889, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.023545963689684868, + "rewards/margins": 0.10206764936447144, + "rewards/rejected": -0.07852168381214142, + "step": 3330 + }, + { + "epoch": 0.22, + "learning_rate": 4.789173696156212e-06, + "logits/chosen": -2.320606231689453, + "logits/rejected": -1.9661529064178467, + "logps/chosen": -271.17156982421875, + "logps/rejected": -267.260009765625, + "loss": 0.6869, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.05269026756286621, + "rewards/margins": 0.14737890660762787, + "rewards/rejected": -0.09468863904476166, + "step": 3340 + }, + { + "epoch": 0.22, + "learning_rate": 4.786872731231662e-06, + "logits/chosen": -2.3447282314300537, + "logits/rejected": -2.2217040061950684, + "logps/chosen": -214.87109375, + "logps/rejected": -212.9879608154297, + "loss": 0.6912, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.02019861713051796, + "rewards/margins": 0.09485939145088196, + "rewards/rejected": -0.0746607705950737, + "step": 3350 + }, + { + "epoch": 0.22, + "learning_rate": 4.784559836858709e-06, + "logits/chosen": -2.318398952484131, + "logits/rejected": -1.8477122783660889, + "logps/chosen": -234.3660888671875, + "logps/rejected": -210.38357543945312, + "loss": 0.6908, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.004489635583013296, + "rewards/margins": 0.06438425183296204, + "rewards/rejected": -0.06887389719486237, + "step": 3360 + }, + { + "epoch": 0.22, + "learning_rate": 4.782235025102542e-06, + "logits/chosen": -2.312790632247925, + "logits/rejected": -2.242957830429077, + "logps/chosen": -228.43661499023438, + "logps/rejected": -220.25283813476562, + "loss": 0.6903, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.021173015236854553, + "rewards/margins": 0.092967689037323, + "rewards/rejected": -0.11414071172475815, + "step": 3370 + }, + { + "epoch": 0.22, + "learning_rate": 4.779898308090519e-06, + "logits/chosen": -2.2664966583251953, + "logits/rejected": -2.0547492504119873, + "logps/chosen": -278.0267333984375, + "logps/rejected": -248.862548828125, + "loss": 0.689, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.03349475562572479, + "rewards/margins": 0.07666581869125366, + "rewards/rejected": -0.11016058921813965, + "step": 3380 + }, + { + "epoch": 0.22, + "learning_rate": 4.777549698012101e-06, + "logits/chosen": -2.216127395629883, + "logits/rejected": -2.0575273036956787, + "logps/chosen": -244.1732177734375, + "logps/rejected": -233.697265625, + "loss": 0.6902, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.009566160850226879, + "rewards/margins": 0.08683688193559647, + "rewards/rejected": -0.09640304744243622, + "step": 3390 + }, + { + "epoch": 0.22, + "learning_rate": 4.775189207118787e-06, + "logits/chosen": -2.2499351501464844, + "logits/rejected": -2.054161548614502, + "logps/chosen": -271.6486511230469, + "logps/rejected": -258.4850158691406, + "loss": 0.6908, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0016902908682823181, + "rewards/margins": 0.09174026548862457, + "rewards/rejected": -0.09004998207092285, + "step": 3400 + }, + { + "epoch": 0.22, + "eval_logits/chosen": -2.3067705631256104, + "eval_logits/rejected": -2.121011734008789, + "eval_logps/chosen": -233.02999877929688, + "eval_logps/rejected": -220.44912719726562, + "eval_loss": 0.6904054880142212, + "eval_rewards/accuracies": 0.6345000267028809, + "eval_rewards/chosen": -0.010250742547214031, + "eval_rewards/margins": 0.07812146842479706, + "eval_rewards/rejected": -0.08837221562862396, + "eval_runtime": 711.3718, + "eval_samples_per_second": 2.811, + "eval_steps_per_second": 1.406, + "step": 3400 + }, + { + "epoch": 0.22, + "learning_rate": 4.772816847724054e-06, + "logits/chosen": -2.405539035797119, + "logits/rejected": -2.133857011795044, + "logps/chosen": -224.6778106689453, + "logps/rejected": -227.91207885742188, + "loss": 0.6923, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.021232225000858307, + "rewards/margins": 0.03736606985330582, + "rewards/rejected": -0.058598291128873825, + "step": 3410 + }, + { + "epoch": 0.22, + "learning_rate": 4.770432632203294e-06, + "logits/chosen": -2.139242649078369, + "logits/rejected": -2.053284168243408, + "logps/chosen": -248.99319458007812, + "logps/rejected": -202.07363891601562, + "loss": 0.6925, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.053460635244846344, + "rewards/margins": 0.03398740664124489, + "rewards/rejected": -0.08744804561138153, + "step": 3420 + }, + { + "epoch": 0.22, + "learning_rate": 4.768036572993738e-06, + "logits/chosen": -2.1945345401763916, + "logits/rejected": -2.288442611694336, + "logps/chosen": -285.4275207519531, + "logps/rejected": -275.6911315917969, + "loss": 0.6901, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.054085589945316315, + "rewards/margins": 0.05863531306385994, + "rewards/rejected": -0.11272089183330536, + "step": 3430 + }, + { + "epoch": 0.23, + "learning_rate": 4.765628682594409e-06, + "logits/chosen": -2.3740832805633545, + "logits/rejected": -2.1946797370910645, + "logps/chosen": -246.08438110351562, + "logps/rejected": -229.9820098876953, + "loss": 0.689, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.023272987455129623, + "rewards/margins": 0.07800062000751495, + "rewards/rejected": -0.10127361863851547, + "step": 3440 + }, + { + "epoch": 0.23, + "learning_rate": 4.763208973566041e-06, + "logits/chosen": -2.185068130493164, + "logits/rejected": -2.204409122467041, + "logps/chosen": -187.07476806640625, + "logps/rejected": -213.8040771484375, + "loss": 0.6903, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.029450953006744385, + "rewards/margins": 0.08014042675495148, + "rewards/rejected": -0.10959136486053467, + "step": 3450 + }, + { + "epoch": 0.23, + "learning_rate": 4.76077745853102e-06, + "logits/chosen": -2.4352564811706543, + "logits/rejected": -2.2802255153656006, + "logps/chosen": -252.4907684326172, + "logps/rejected": -262.39654541015625, + "loss": 0.6912, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.024366283789277077, + "rewards/margins": 0.08780606091022491, + "rewards/rejected": -0.11217234283685684, + "step": 3460 + }, + { + "epoch": 0.23, + "learning_rate": 4.758334150173322e-06, + "logits/chosen": -2.3180832862854004, + "logits/rejected": -2.12862491607666, + "logps/chosen": -261.38922119140625, + "logps/rejected": -242.67416381835938, + "loss": 0.693, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.01694723591208458, + "rewards/margins": 0.06338542699813843, + "rewards/rejected": -0.04643818736076355, + "step": 3470 + }, + { + "epoch": 0.23, + "learning_rate": 4.755879061238439e-06, + "logits/chosen": -2.39463472366333, + "logits/rejected": -2.1686863899230957, + "logps/chosen": -254.03067016601562, + "logps/rejected": -246.9172821044922, + "loss": 0.6905, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.02142667956650257, + "rewards/margins": 0.05523737147450447, + "rewards/rejected": -0.03381068632006645, + "step": 3480 + }, + { + "epoch": 0.23, + "learning_rate": 4.753412204533317e-06, + "logits/chosen": -2.5263428688049316, + "logits/rejected": -2.0499589443206787, + "logps/chosen": -260.4190673828125, + "logps/rejected": -223.6571502685547, + "loss": 0.689, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.024389993399381638, + "rewards/margins": 0.0862947553396225, + "rewards/rejected": -0.061904750764369965, + "step": 3490 + }, + { + "epoch": 0.23, + "learning_rate": 4.750933592926292e-06, + "logits/chosen": -2.398818254470825, + "logits/rejected": -2.0742526054382324, + "logps/chosen": -217.53012084960938, + "logps/rejected": -198.85635375976562, + "loss": 0.6901, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.011975173838436604, + "rewards/margins": 0.08542615175247192, + "rewards/rejected": -0.073450967669487, + "step": 3500 + }, + { + "epoch": 0.23, + "eval_logits/chosen": -2.3047008514404297, + "eval_logits/rejected": -2.119340419769287, + "eval_logps/chosen": -230.0756072998047, + "eval_logps/rejected": -217.86997985839844, + "eval_loss": 0.6902625560760498, + "eval_rewards/accuracies": 0.6355000138282776, + "eval_rewards/chosen": 0.019293660297989845, + "eval_rewards/margins": 0.0818745344877243, + "eval_rewards/rejected": -0.06258086860179901, + "eval_runtime": 712.428, + "eval_samples_per_second": 2.807, + "eval_steps_per_second": 1.404, + "step": 3500 + }, + { + "epoch": 0.23, + "learning_rate": 4.7484432393470124e-06, + "logits/chosen": -2.486417531967163, + "logits/rejected": -1.9664733409881592, + "logps/chosen": -199.4810028076172, + "logps/rejected": -157.95590209960938, + "loss": 0.6836, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.026009265333414078, + "rewards/margins": 0.13871736824512482, + "rewards/rejected": -0.11270810663700104, + "step": 3510 + }, + { + "epoch": 0.23, + "learning_rate": 4.745941156786385e-06, + "logits/chosen": -2.092363119125366, + "logits/rejected": -2.062434434890747, + "logps/chosen": -150.09767150878906, + "logps/rejected": -195.23306274414062, + "loss": 0.6845, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.029414480552077293, + "rewards/margins": 0.1429421305656433, + "rewards/rejected": -0.11352765560150146, + "step": 3520 + }, + { + "epoch": 0.23, + "learning_rate": 4.743427358296497e-06, + "logits/chosen": -2.2270138263702393, + "logits/rejected": -2.030658483505249, + "logps/chosen": -187.15162658691406, + "logps/rejected": -217.05062866210938, + "loss": 0.6858, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.025566350668668747, + "rewards/margins": 0.18591654300689697, + "rewards/rejected": -0.16035018861293793, + "step": 3530 + }, + { + "epoch": 0.23, + "learning_rate": 4.740901856990553e-06, + "logits/chosen": -2.149793863296509, + "logits/rejected": -1.9605810642242432, + "logps/chosen": -255.32052612304688, + "logps/rejected": -219.46920776367188, + "loss": 0.6922, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.013713860884308815, + "rewards/margins": 0.08221259713172913, + "rewards/rejected": -0.06849874556064606, + "step": 3540 + }, + { + "epoch": 0.23, + "learning_rate": 4.738364666042804e-06, + "logits/chosen": -2.3816933631896973, + "logits/rejected": -1.9584366083145142, + "logps/chosen": -286.97796630859375, + "logps/rejected": -231.30648803710938, + "loss": 0.6913, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.0476701520383358, + "rewards/margins": 0.07968376576900482, + "rewards/rejected": -0.03201361373066902, + "step": 3550 + }, + { + "epoch": 0.23, + "learning_rate": 4.735815798688483e-06, + "logits/chosen": -2.3232216835021973, + "logits/rejected": -2.1071863174438477, + "logps/chosen": -194.87954711914062, + "logps/rejected": -224.50369262695312, + "loss": 0.6873, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.03452432155609131, + "rewards/margins": 0.09485017508268356, + "rewards/rejected": -0.06032586842775345, + "step": 3560 + }, + { + "epoch": 0.23, + "learning_rate": 4.7332552682237285e-06, + "logits/chosen": -2.3406100273132324, + "logits/rejected": -1.8915197849273682, + "logps/chosen": -169.45468139648438, + "logps/rejected": -163.56141662597656, + "loss": 0.6888, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.043052736669778824, + "rewards/margins": 0.09879221022129059, + "rewards/rejected": -0.05573946237564087, + "step": 3570 + }, + { + "epoch": 0.23, + "learning_rate": 4.7306830880055234e-06, + "logits/chosen": -2.3042500019073486, + "logits/rejected": -2.2339184284210205, + "logps/chosen": -191.6345672607422, + "logps/rejected": -207.58676147460938, + "loss": 0.6895, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.00900744367390871, + "rewards/margins": 0.07615131884813309, + "rewards/rejected": -0.08515877276659012, + "step": 3580 + }, + { + "epoch": 0.23, + "learning_rate": 4.728099271451619e-06, + "logits/chosen": -2.341984510421753, + "logits/rejected": -2.2418646812438965, + "logps/chosen": -190.93679809570312, + "logps/rejected": -190.25389099121094, + "loss": 0.6903, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0007123596733435988, + "rewards/margins": 0.06269621104001999, + "rewards/rejected": -0.0619838610291481, + "step": 3590 + }, + { + "epoch": 0.24, + "learning_rate": 4.725503832040466e-06, + "logits/chosen": -2.1511435508728027, + "logits/rejected": -2.1525609493255615, + "logps/chosen": -148.30784606933594, + "logps/rejected": -181.79171752929688, + "loss": 0.6913, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.008500255644321442, + "rewards/margins": 0.07396461069583893, + "rewards/rejected": -0.06546434760093689, + "step": 3600 + }, + { + "epoch": 0.24, + "eval_logits/chosen": -2.3040733337402344, + "eval_logits/rejected": -2.11887264251709, + "eval_logps/chosen": -230.52880859375, + "eval_logps/rejected": -218.516357421875, + "eval_loss": 0.6901616454124451, + "eval_rewards/accuracies": 0.6359999775886536, + "eval_rewards/chosen": 0.014761154539883137, + "eval_rewards/margins": 0.08380559831857681, + "eval_rewards/rejected": -0.06904443353414536, + "eval_runtime": 712.8273, + "eval_samples_per_second": 2.806, + "eval_steps_per_second": 1.403, + "step": 3600 + }, + { + "epoch": 0.24, + "learning_rate": 4.722896783311152e-06, + "logits/chosen": -2.282073497772217, + "logits/rejected": -2.17645263671875, + "logps/chosen": -259.38861083984375, + "logps/rejected": -316.8056640625, + "loss": 0.6906, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.003396064043045044, + "rewards/margins": 0.06283075362443924, + "rewards/rejected": -0.06622681021690369, + "step": 3610 + }, + { + "epoch": 0.24, + "learning_rate": 4.720278138863318e-06, + "logits/chosen": -2.4280340671539307, + "logits/rejected": -2.218613862991333, + "logps/chosen": -190.4235382080078, + "logps/rejected": -164.82579040527344, + "loss": 0.6921, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.003018149407580495, + "rewards/margins": 0.0620940737426281, + "rewards/rejected": -0.06511221826076508, + "step": 3620 + }, + { + "epoch": 0.24, + "learning_rate": 4.717647912357095e-06, + "logits/chosen": -2.361996650695801, + "logits/rejected": -2.448129892349243, + "logps/chosen": -275.29071044921875, + "logps/rejected": -289.21759033203125, + "loss": 0.6921, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": -0.02813585475087166, + "rewards/margins": 0.009944294579327106, + "rewards/rejected": -0.03808014467358589, + "step": 3630 + }, + { + "epoch": 0.24, + "learning_rate": 4.715006117513035e-06, + "logits/chosen": -2.460373640060425, + "logits/rejected": -2.220986843109131, + "logps/chosen": -321.8302307128906, + "logps/rejected": -273.5966796875, + "loss": 0.6906, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.022954054176807404, + "rewards/margins": 0.06946495175361633, + "rewards/rejected": -0.04651089757680893, + "step": 3640 + }, + { + "epoch": 0.24, + "learning_rate": 4.7123527681120326e-06, + "logits/chosen": -2.275266647338867, + "logits/rejected": -2.134054660797119, + "logps/chosen": -247.04855346679688, + "logps/rejected": -226.111328125, + "loss": 0.6898, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.0012259014183655381, + "rewards/margins": 0.07214462757110596, + "rewards/rejected": -0.0709187239408493, + "step": 3650 + }, + { + "epoch": 0.24, + "learning_rate": 4.7096878779952594e-06, + "logits/chosen": -2.357933759689331, + "logits/rejected": -2.303584575653076, + "logps/chosen": -275.9615783691406, + "logps/rejected": -278.7004089355469, + "loss": 0.6923, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.005482043139636517, + "rewards/margins": 0.05321590229868889, + "rewards/rejected": -0.058697957545518875, + "step": 3660 + }, + { + "epoch": 0.24, + "learning_rate": 4.707011461064086e-06, + "logits/chosen": -2.159414768218994, + "logits/rejected": -1.9229214191436768, + "logps/chosen": -308.0876770019531, + "logps/rejected": -274.2186584472656, + "loss": 0.6905, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.0077134473249316216, + "rewards/margins": 0.10321645438671112, + "rewards/rejected": -0.09550300985574722, + "step": 3670 + }, + { + "epoch": 0.24, + "learning_rate": 4.704323531280016e-06, + "logits/chosen": -2.2135162353515625, + "logits/rejected": -2.040491819381714, + "logps/chosen": -324.78515625, + "logps/rejected": -248.89889526367188, + "loss": 0.6895, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.02426346018910408, + "rewards/margins": 0.07309317588806152, + "rewards/rejected": -0.048829711973667145, + "step": 3680 + }, + { + "epoch": 0.24, + "learning_rate": 4.701624102664606e-06, + "logits/chosen": -2.370241165161133, + "logits/rejected": -2.0312302112579346, + "logps/chosen": -262.2061767578125, + "logps/rejected": -215.11416625976562, + "loss": 0.6894, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.010902756825089455, + "rewards/margins": 0.07008221745491028, + "rewards/rejected": -0.08098497986793518, + "step": 3690 + }, + { + "epoch": 0.24, + "learning_rate": 4.698913189299399e-06, + "logits/chosen": -2.2025485038757324, + "logits/rejected": -2.3091206550598145, + "logps/chosen": -187.55035400390625, + "logps/rejected": -225.8077850341797, + "loss": 0.694, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.013985480181872845, + "rewards/margins": 0.05750720947980881, + "rewards/rejected": -0.07149268686771393, + "step": 3700 + }, + { + "epoch": 0.24, + "eval_logits/chosen": -2.281970262527466, + "eval_logits/rejected": -2.0983099937438965, + "eval_logps/chosen": -234.87881469726562, + "eval_logps/rejected": -221.8666534423828, + "eval_loss": 0.6903954744338989, + "eval_rewards/accuracies": 0.6389999985694885, + "eval_rewards/chosen": -0.028738651424646378, + "eval_rewards/margins": 0.0738087072968483, + "eval_rewards/rejected": -0.10254734754562378, + "eval_runtime": 711.1172, + "eval_samples_per_second": 2.812, + "eval_steps_per_second": 1.406, + "step": 3700 + }, + { + "epoch": 0.24, + "learning_rate": 4.696190805325847e-06, + "logits/chosen": -2.2970728874206543, + "logits/rejected": -2.1544101238250732, + "logps/chosen": -207.85110473632812, + "logps/rejected": -189.63479614257812, + "loss": 0.69, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.014722605235874653, + "rewards/margins": 0.08902369439601898, + "rewards/rejected": -0.10374629497528076, + "step": 3710 + }, + { + "epoch": 0.24, + "learning_rate": 4.693456964945239e-06, + "logits/chosen": -2.416215419769287, + "logits/rejected": -1.9415165185928345, + "logps/chosen": -298.4938049316406, + "logps/rejected": -207.4619598388672, + "loss": 0.6882, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.003882091958075762, + "rewards/margins": 0.10060401260852814, + "rewards/rejected": -0.10448610782623291, + "step": 3720 + }, + { + "epoch": 0.24, + "learning_rate": 4.6907116824186245e-06, + "logits/chosen": -2.3689780235290527, + "logits/rejected": -2.3212902545928955, + "logps/chosen": -226.6787109375, + "logps/rejected": -231.8772430419922, + "loss": 0.689, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.006488241255283356, + "rewards/margins": 0.05394769459962845, + "rewards/rejected": -0.047459445893764496, + "step": 3730 + }, + { + "epoch": 0.24, + "learning_rate": 4.687954972066742e-06, + "logits/chosen": -2.260472297668457, + "logits/rejected": -1.9865401983261108, + "logps/chosen": -227.1370086669922, + "logps/rejected": -220.00808715820312, + "loss": 0.6858, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.026792461052536964, + "rewards/margins": 0.1374204158782959, + "rewards/rejected": -0.11062794923782349, + "step": 3740 + }, + { + "epoch": 0.25, + "learning_rate": 4.685186848269944e-06, + "logits/chosen": -2.2468438148498535, + "logits/rejected": -2.11405873298645, + "logps/chosen": -209.9337615966797, + "logps/rejected": -178.09884643554688, + "loss": 0.6915, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.01938585564494133, + "rewards/margins": 0.05544018745422363, + "rewards/rejected": -0.0360543318092823, + "step": 3750 + }, + { + "epoch": 0.25, + "learning_rate": 4.682407325468119e-06, + "logits/chosen": -2.323763370513916, + "logits/rejected": -1.967911720275879, + "logps/chosen": -214.25634765625, + "logps/rejected": -191.69644165039062, + "loss": 0.6884, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.03162650763988495, + "rewards/margins": 0.11069830507040024, + "rewards/rejected": -0.07907179743051529, + "step": 3760 + }, + { + "epoch": 0.25, + "learning_rate": 4.67961641816062e-06, + "logits/chosen": -2.301642417907715, + "logits/rejected": -2.097708225250244, + "logps/chosen": -271.3573303222656, + "logps/rejected": -236.2880096435547, + "loss": 0.6916, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.044394414871931076, + "rewards/margins": 0.06260766088962555, + "rewards/rejected": -0.018213242292404175, + "step": 3770 + }, + { + "epoch": 0.25, + "learning_rate": 4.676814140906188e-06, + "logits/chosen": -2.180407762527466, + "logits/rejected": -2.048719882965088, + "logps/chosen": -237.28604125976562, + "logps/rejected": -216.0989227294922, + "loss": 0.6887, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.022501787170767784, + "rewards/margins": 0.09418568760156631, + "rewards/rejected": -0.07168390601873398, + "step": 3780 + }, + { + "epoch": 0.25, + "learning_rate": 4.674000508322872e-06, + "logits/chosen": -2.022406578063965, + "logits/rejected": -2.098022222518921, + "logps/chosen": -214.1189727783203, + "logps/rejected": -229.61483764648438, + "loss": 0.6918, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0341593436896801, + "rewards/margins": 0.06312253326177597, + "rewards/rejected": -0.02896319329738617, + "step": 3790 + }, + { + "epoch": 0.25, + "learning_rate": 4.671175535087959e-06, + "logits/chosen": -2.194871664047241, + "logits/rejected": -2.153036117553711, + "logps/chosen": -285.90673828125, + "logps/rejected": -293.92242431640625, + "loss": 0.6891, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.05420888587832451, + "rewards/margins": 0.11247305572032928, + "rewards/rejected": -0.05826416611671448, + "step": 3800 + }, + { + "epoch": 0.25, + "eval_logits/chosen": -2.2757647037506104, + "eval_logits/rejected": -2.092339277267456, + "eval_logps/chosen": -227.5012969970703, + "eval_logps/rejected": -213.98056030273438, + "eval_loss": 0.6902437806129456, + "eval_rewards/accuracies": 0.6320000290870667, + "eval_rewards/chosen": 0.04503653571009636, + "eval_rewards/margins": 0.06872312724590302, + "eval_rewards/rejected": -0.023686589673161507, + "eval_runtime": 711.0018, + "eval_samples_per_second": 2.813, + "eval_steps_per_second": 1.406, + "step": 3800 + }, + { + "epoch": 0.25, + "learning_rate": 4.6683392359378924e-06, + "logits/chosen": -2.1588714122772217, + "logits/rejected": -1.9822829961776733, + "logps/chosen": -231.51382446289062, + "logps/rejected": -211.3966827392578, + "loss": 0.6913, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03635237738490105, + "rewards/margins": 0.05399390310049057, + "rewards/rejected": -0.017641523852944374, + "step": 3810 + }, + { + "epoch": 0.25, + "learning_rate": 4.665491625668198e-06, + "logits/chosen": -2.074720859527588, + "logits/rejected": -2.1197152137756348, + "logps/chosen": -153.5623321533203, + "logps/rejected": -185.47169494628906, + "loss": 0.6873, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.02684941329061985, + "rewards/margins": 0.08354301750659943, + "rewards/rejected": -0.05669360235333443, + "step": 3820 + }, + { + "epoch": 0.25, + "learning_rate": 4.662632719133407e-06, + "logits/chosen": -2.3514442443847656, + "logits/rejected": -2.0640666484832764, + "logps/chosen": -225.99966430664062, + "logps/rejected": -167.13034057617188, + "loss": 0.6905, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.047978900372982025, + "rewards/margins": 0.08138148486614227, + "rewards/rejected": -0.03340258076786995, + "step": 3830 + }, + { + "epoch": 0.25, + "learning_rate": 4.659762531246974e-06, + "logits/chosen": -2.2433078289031982, + "logits/rejected": -2.0894062519073486, + "logps/chosen": -214.85104370117188, + "logps/rejected": -186.99905395507812, + "loss": 0.6904, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0027232493739575148, + "rewards/margins": 0.061046671122312546, + "rewards/rejected": -0.06376992166042328, + "step": 3840 + }, + { + "epoch": 0.25, + "learning_rate": 4.656881076981207e-06, + "logits/chosen": -2.3131306171417236, + "logits/rejected": -2.1745781898498535, + "logps/chosen": -212.8335418701172, + "logps/rejected": -200.85433959960938, + "loss": 0.6912, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.003498140024021268, + "rewards/margins": 0.059195131063461304, + "rewards/rejected": -0.055697001516819, + "step": 3850 + }, + { + "epoch": 0.25, + "learning_rate": 4.653988371367183e-06, + "logits/chosen": -2.2689290046691895, + "logits/rejected": -2.0051369667053223, + "logps/chosen": -239.0817413330078, + "logps/rejected": -183.85263061523438, + "loss": 0.6928, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.01722235046327114, + "rewards/margins": 0.05978889390826225, + "rewards/rejected": -0.04256654158234596, + "step": 3860 + }, + { + "epoch": 0.25, + "learning_rate": 4.651084429494671e-06, + "logits/chosen": -2.3513553142547607, + "logits/rejected": -2.0689337253570557, + "logps/chosen": -272.21990966796875, + "logps/rejected": -197.9673614501953, + "loss": 0.6918, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.0313107892870903, + "rewards/margins": 0.052712440490722656, + "rewards/rejected": -0.021401654928922653, + "step": 3870 + }, + { + "epoch": 0.25, + "learning_rate": 4.648169266512053e-06, + "logits/chosen": -2.4198365211486816, + "logits/rejected": -2.141869068145752, + "logps/chosen": -219.36965942382812, + "logps/rejected": -180.29354858398438, + "loss": 0.69, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0595845952630043, + "rewards/margins": 0.058780230581760406, + "rewards/rejected": 0.0008043628185987473, + "step": 3880 + }, + { + "epoch": 0.25, + "learning_rate": 4.6452428976262505e-06, + "logits/chosen": -2.23230242729187, + "logits/rejected": -2.0137524604797363, + "logps/chosen": -199.0391387939453, + "logps/rejected": -166.9921112060547, + "loss": 0.6893, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.04752303659915924, + "rewards/margins": 0.1227576732635498, + "rewards/rejected": -0.07523464411497116, + "step": 3890 + }, + { + "epoch": 0.26, + "learning_rate": 4.642305338102633e-06, + "logits/chosen": -2.2830934524536133, + "logits/rejected": -2.3560256958007812, + "logps/chosen": -158.4663543701172, + "logps/rejected": -184.8391876220703, + "loss": 0.6877, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.032832108438014984, + "rewards/margins": 0.08227143436670303, + "rewards/rejected": -0.04943932965397835, + "step": 3900 + }, + { + "epoch": 0.26, + "eval_logits/chosen": -2.2935938835144043, + "eval_logits/rejected": -2.108933210372925, + "eval_logps/chosen": -229.80093383789062, + "eval_logps/rejected": -217.31515502929688, + "eval_loss": 0.6902133822441101, + "eval_rewards/accuracies": 0.6244999766349792, + "eval_rewards/chosen": 0.02204015851020813, + "eval_rewards/margins": 0.07907257974147797, + "eval_rewards/rejected": -0.05703242868185043, + "eval_runtime": 711.7595, + "eval_samples_per_second": 2.81, + "eval_steps_per_second": 1.405, + "step": 3900 + }, + { + "epoch": 0.26, + "learning_rate": 4.639356603264953e-06, + "logits/chosen": -2.338958263397217, + "logits/rejected": -2.1034774780273438, + "logps/chosen": -240.74267578125, + "logps/rejected": -219.4384307861328, + "loss": 0.6924, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.010891283862292767, + "rewards/margins": 0.04426593333482742, + "rewards/rejected": -0.03337464481592178, + "step": 3910 + }, + { + "epoch": 0.26, + "learning_rate": 4.636396708495255e-06, + "logits/chosen": -2.1757044792175293, + "logits/rejected": -2.1606650352478027, + "logps/chosen": -225.34707641601562, + "logps/rejected": -206.83816528320312, + "loss": 0.6914, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.03848005831241608, + "rewards/margins": 0.06334998458623886, + "rewards/rejected": -0.024869924411177635, + "step": 3920 + }, + { + "epoch": 0.26, + "learning_rate": 4.633425669233799e-06, + "logits/chosen": -2.274425983428955, + "logits/rejected": -2.2967169284820557, + "logps/chosen": -230.49679565429688, + "logps/rejected": -234.1136474609375, + "loss": 0.6892, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.04329800605773926, + "rewards/margins": 0.08064167201519012, + "rewards/rejected": -0.03734365105628967, + "step": 3930 + }, + { + "epoch": 0.26, + "learning_rate": 4.6304435009789825e-06, + "logits/chosen": -2.2880218029022217, + "logits/rejected": -2.0706839561462402, + "logps/chosen": -233.8865509033203, + "logps/rejected": -172.7992401123047, + "loss": 0.6893, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.03620678931474686, + "rewards/margins": 0.09822587668895721, + "rewards/rejected": -0.062019091099500656, + "step": 3940 + }, + { + "epoch": 0.26, + "learning_rate": 4.627450219287256e-06, + "logits/chosen": -2.3368616104125977, + "logits/rejected": -2.1908602714538574, + "logps/chosen": -177.8789825439453, + "logps/rejected": -161.35159301757812, + "loss": 0.6905, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.03680109232664108, + "rewards/margins": 0.05983690172433853, + "rewards/rejected": -0.02303580567240715, + "step": 3950 + }, + { + "epoch": 0.26, + "learning_rate": 4.624445839773042e-06, + "logits/chosen": -2.2832131385803223, + "logits/rejected": -2.2111704349517822, + "logps/chosen": -169.68849182128906, + "logps/rejected": -171.02833557128906, + "loss": 0.6922, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": 0.003228194313123822, + "rewards/margins": 0.024037057533860207, + "rewards/rejected": -0.020808864384889603, + "step": 3960 + }, + { + "epoch": 0.26, + "learning_rate": 4.621430378108656e-06, + "logits/chosen": -2.264580249786377, + "logits/rejected": -2.1098790168762207, + "logps/chosen": -257.55718994140625, + "logps/rejected": -260.2860412597656, + "loss": 0.689, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.002011381322517991, + "rewards/margins": 0.09493207186460495, + "rewards/rejected": -0.092920683324337, + "step": 3970 + }, + { + "epoch": 0.26, + "learning_rate": 4.618403850024223e-06, + "logits/chosen": -2.1793527603149414, + "logits/rejected": -1.9493013620376587, + "logps/chosen": -256.003173828125, + "logps/rejected": -215.7708282470703, + "loss": 0.691, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.005130067467689514, + "rewards/margins": 0.06429970264434814, + "rewards/rejected": -0.06942977011203766, + "step": 3980 + }, + { + "epoch": 0.26, + "learning_rate": 4.615366271307598e-06, + "logits/chosen": -2.3207273483276367, + "logits/rejected": -2.164661407470703, + "logps/chosen": -196.49105834960938, + "logps/rejected": -191.2655029296875, + "loss": 0.6906, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.045480918139219284, + "rewards/margins": 0.06666766852140427, + "rewards/rejected": -0.11214858293533325, + "step": 3990 + }, + { + "epoch": 0.26, + "learning_rate": 4.612317657804277e-06, + "logits/chosen": -2.1907215118408203, + "logits/rejected": -2.2445192337036133, + "logps/chosen": -149.4791259765625, + "logps/rejected": -210.52490234375, + "loss": 0.6884, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.02246091142296791, + "rewards/margins": 0.09697895497083664, + "rewards/rejected": -0.11943986266851425, + "step": 4000 + }, + { + "epoch": 0.26, + "eval_logits/chosen": -2.2913191318511963, + "eval_logits/rejected": -2.106405258178711, + "eval_logps/chosen": -232.1314697265625, + "eval_logps/rejected": -219.69049072265625, + "eval_loss": 0.6901000738143921, + "eval_rewards/accuracies": 0.6359999775886536, + "eval_rewards/chosen": -0.0012654466554522514, + "eval_rewards/margins": 0.07952029258012772, + "eval_rewards/rejected": -0.08078574389219284, + "eval_runtime": 712.6087, + "eval_samples_per_second": 2.807, + "eval_steps_per_second": 1.403, + "step": 4000 + }, + { + "epoch": 0.26, + "learning_rate": 4.6092580254173236e-06, + "logits/chosen": -2.1913225650787354, + "logits/rejected": -1.959183931350708, + "logps/chosen": -258.2712097167969, + "logps/rejected": -247.7600555419922, + "loss": 0.6902, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.00948442704975605, + "rewards/margins": 0.08156983554363251, + "rewards/rejected": -0.09105426073074341, + "step": 4010 + }, + { + "epoch": 0.26, + "learning_rate": 4.606187390107277e-06, + "logits/chosen": -2.1633963584899902, + "logits/rejected": -1.9803078174591064, + "logps/chosen": -230.2688751220703, + "logps/rejected": -197.57717895507812, + "loss": 0.6914, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.053934670984745026, + "rewards/margins": 0.06301041692495346, + "rewards/rejected": -0.11694508790969849, + "step": 4020 + }, + { + "epoch": 0.26, + "learning_rate": 4.603105767892077e-06, + "logits/chosen": -2.264932632446289, + "logits/rejected": -2.1878082752227783, + "logps/chosen": -195.32559204101562, + "logps/rejected": -221.0668182373047, + "loss": 0.6907, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.0063558658584952354, + "rewards/margins": 0.07787985354661942, + "rewards/rejected": -0.08423570543527603, + "step": 4030 + }, + { + "epoch": 0.26, + "learning_rate": 4.6000131748469725e-06, + "logits/chosen": -2.3445935249328613, + "logits/rejected": -1.9680637121200562, + "logps/chosen": -250.29660034179688, + "logps/rejected": -182.46685791015625, + "loss": 0.6903, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0015633717412129045, + "rewards/margins": 0.0648859366774559, + "rewards/rejected": -0.06644931435585022, + "step": 4040 + }, + { + "epoch": 0.26, + "learning_rate": 4.596909627104445e-06, + "logits/chosen": -2.3850934505462646, + "logits/rejected": -2.3055262565612793, + "logps/chosen": -251.54226684570312, + "logps/rejected": -226.3249053955078, + "loss": 0.6889, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.028039926663041115, + "rewards/margins": 0.07416818290948868, + "rewards/rejected": -0.10220811516046524, + "step": 4050 + }, + { + "epoch": 0.27, + "learning_rate": 4.5937951408541215e-06, + "logits/chosen": -2.433464527130127, + "logits/rejected": -1.9047530889511108, + "logps/chosen": -254.9823760986328, + "logps/rejected": -220.0203857421875, + "loss": 0.6904, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.023438826203346252, + "rewards/margins": 0.10415074974298477, + "rewards/rejected": -0.12758956849575043, + "step": 4060 + }, + { + "epoch": 0.27, + "learning_rate": 4.590669732342685e-06, + "logits/chosen": -2.1566336154937744, + "logits/rejected": -2.012592315673828, + "logps/chosen": -213.21224975585938, + "logps/rejected": -226.32504272460938, + "loss": 0.6912, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.010646522045135498, + "rewards/margins": 0.11220131814479828, + "rewards/rejected": -0.12284784018993378, + "step": 4070 + }, + { + "epoch": 0.27, + "learning_rate": 4.587533417873799e-06, + "logits/chosen": -2.2341346740722656, + "logits/rejected": -2.3223414421081543, + "logps/chosen": -195.48965454101562, + "logps/rejected": -263.8111267089844, + "loss": 0.6928, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.023255977779626846, + "rewards/margins": 0.08617839962244034, + "rewards/rejected": -0.10943436622619629, + "step": 4080 + }, + { + "epoch": 0.27, + "learning_rate": 4.584386213808016e-06, + "logits/chosen": -2.2321903705596924, + "logits/rejected": -1.8963381052017212, + "logps/chosen": -222.75430297851562, + "logps/rejected": -183.17471313476562, + "loss": 0.6896, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.01075290609151125, + "rewards/margins": 0.061965636909008026, + "rewards/rejected": -0.072718545794487, + "step": 4090 + }, + { + "epoch": 0.27, + "learning_rate": 4.581228136562693e-06, + "logits/chosen": -2.122157096862793, + "logits/rejected": -2.2352585792541504, + "logps/chosen": -239.14389038085938, + "logps/rejected": -216.71829223632812, + "loss": 0.693, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.009634166955947876, + "rewards/margins": 0.03259027749300003, + "rewards/rejected": -0.042224448174238205, + "step": 4100 + }, + { + "epoch": 0.27, + "eval_logits/chosen": -2.3034896850585938, + "eval_logits/rejected": -2.117652654647827, + "eval_logps/chosen": -233.33334350585938, + "eval_logps/rejected": -219.198486328125, + "eval_loss": 0.6903753876686096, + "eval_rewards/accuracies": 0.628000020980835, + "eval_rewards/chosen": -0.013284044340252876, + "eval_rewards/margins": 0.06258184462785721, + "eval_rewards/rejected": -0.07586588710546494, + "eval_runtime": 713.5236, + "eval_samples_per_second": 2.803, + "eval_steps_per_second": 1.401, + "step": 4100 + }, + { + "epoch": 0.27, + "learning_rate": 4.578059202611909e-06, + "logits/chosen": -2.3259823322296143, + "logits/rejected": -2.100602149963379, + "logps/chosen": -256.8636779785156, + "logps/rejected": -246.4088897705078, + "loss": 0.6917, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.002952608745545149, + "rewards/margins": 0.0430120974779129, + "rewards/rejected": -0.04005948826670647, + "step": 4110 + }, + { + "epoch": 0.27, + "learning_rate": 4.574879428486376e-06, + "logits/chosen": -2.3123717308044434, + "logits/rejected": -2.031857967376709, + "logps/chosen": -214.11416625976562, + "logps/rejected": -217.881591796875, + "loss": 0.6911, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.011976310983300209, + "rewards/margins": 0.06689594686031342, + "rewards/rejected": -0.07887225598096848, + "step": 4120 + }, + { + "epoch": 0.27, + "learning_rate": 4.571688830773352e-06, + "logits/chosen": -2.3346524238586426, + "logits/rejected": -2.226778507232666, + "logps/chosen": -223.7250213623047, + "logps/rejected": -206.4811553955078, + "loss": 0.6918, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.016568060964345932, + "rewards/margins": 0.02635127305984497, + "rewards/rejected": -0.0429193340241909, + "step": 4130 + }, + { + "epoch": 0.27, + "learning_rate": 4.568487426116559e-06, + "logits/chosen": -2.2562003135681152, + "logits/rejected": -2.2672030925750732, + "logps/chosen": -172.1905059814453, + "logps/rejected": -168.9148712158203, + "loss": 0.6929, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.0021867998875677586, + "rewards/margins": 0.04257757216691971, + "rewards/rejected": -0.04476437345147133, + "step": 4140 + }, + { + "epoch": 0.27, + "learning_rate": 4.565275231216092e-06, + "logits/chosen": -2.171159267425537, + "logits/rejected": -2.1469078063964844, + "logps/chosen": -151.336669921875, + "logps/rejected": -201.35855102539062, + "loss": 0.6907, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 6.799399852752686e-05, + "rewards/margins": 0.04031980410218239, + "rewards/rejected": -0.04025180637836456, + "step": 4150 + }, + { + "epoch": 0.27, + "learning_rate": 4.562052262828331e-06, + "logits/chosen": -2.2262353897094727, + "logits/rejected": -2.083603620529175, + "logps/chosen": -201.1298065185547, + "logps/rejected": -201.3617706298828, + "loss": 0.6919, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.02318784035742283, + "rewards/margins": 0.055079467594623566, + "rewards/rejected": -0.07826730608940125, + "step": 4160 + }, + { + "epoch": 0.27, + "learning_rate": 4.558818537765861e-06, + "logits/chosen": -2.4017839431762695, + "logits/rejected": -2.179560899734497, + "logps/chosen": -237.0455322265625, + "logps/rejected": -208.0402374267578, + "loss": 0.6922, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.010311352089047432, + "rewards/margins": 0.05862750858068466, + "rewards/rejected": -0.06893886625766754, + "step": 4170 + }, + { + "epoch": 0.27, + "learning_rate": 4.555574072897374e-06, + "logits/chosen": -2.3054423332214355, + "logits/rejected": -2.3093464374542236, + "logps/chosen": -202.890625, + "logps/rejected": -206.8453826904297, + "loss": 0.689, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.009293651208281517, + "rewards/margins": 0.062263913452625275, + "rewards/rejected": -0.07155755162239075, + "step": 4180 + }, + { + "epoch": 0.27, + "learning_rate": 4.552318885147589e-06, + "logits/chosen": -2.427234649658203, + "logits/rejected": -2.067701816558838, + "logps/chosen": -240.99063110351562, + "logps/rejected": -188.87326049804688, + "loss": 0.6911, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.000729889259673655, + "rewards/margins": 0.07272221148014069, + "rewards/rejected": -0.07345209270715714, + "step": 4190 + }, + { + "epoch": 0.27, + "learning_rate": 4.549052991497159e-06, + "logits/chosen": -2.283116102218628, + "logits/rejected": -2.254042387008667, + "logps/chosen": -181.29025268554688, + "logps/rejected": -188.34085083007812, + "loss": 0.691, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.011230905540287495, + "rewards/margins": 0.06631726771593094, + "rewards/rejected": -0.07754816114902496, + "step": 4200 + }, + { + "epoch": 0.27, + "eval_logits/chosen": -2.296299934387207, + "eval_logits/rejected": -2.1112263202667236, + "eval_logps/chosen": -232.25408935546875, + "eval_logps/rejected": -218.76133728027344, + "eval_loss": 0.6903825402259827, + "eval_rewards/accuracies": 0.6359999775886536, + "eval_rewards/chosen": -0.002491473685950041, + "eval_rewards/margins": 0.0690029114484787, + "eval_rewards/rejected": -0.07149438560009003, + "eval_runtime": 711.9807, + "eval_samples_per_second": 2.809, + "eval_steps_per_second": 1.405, + "step": 4200 + }, + { + "epoch": 0.28, + "learning_rate": 4.545776408982585e-06, + "logits/chosen": -2.222346782684326, + "logits/rejected": -2.215831756591797, + "logps/chosen": -230.8057403564453, + "logps/rejected": -228.67105102539062, + "loss": 0.6895, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.009383128955960274, + "rewards/margins": 0.0687854140996933, + "rewards/rejected": -0.059402287006378174, + "step": 4210 + }, + { + "epoch": 0.28, + "learning_rate": 4.542489154696128e-06, + "logits/chosen": -2.435891628265381, + "logits/rejected": -2.0726494789123535, + "logps/chosen": -265.7242736816406, + "logps/rejected": -210.0156707763672, + "loss": 0.6916, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.019960414618253708, + "rewards/margins": 0.05542059987783432, + "rewards/rejected": -0.035460181534290314, + "step": 4220 + }, + { + "epoch": 0.28, + "learning_rate": 4.5391912457857145e-06, + "logits/chosen": -2.3141016960144043, + "logits/rejected": -2.055931568145752, + "logps/chosen": -264.88525390625, + "logps/rejected": -223.41726684570312, + "loss": 0.6903, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.0008437506621703506, + "rewards/margins": 0.05641711503267288, + "rewards/rejected": -0.05557336285710335, + "step": 4230 + }, + { + "epoch": 0.28, + "learning_rate": 4.535882699454854e-06, + "logits/chosen": -2.3128061294555664, + "logits/rejected": -2.189279556274414, + "logps/chosen": -270.4936218261719, + "logps/rejected": -303.23992919921875, + "loss": 0.6888, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.005573967471718788, + "rewards/margins": 0.11083599179983139, + "rewards/rejected": -0.10526201874017715, + "step": 4240 + }, + { + "epoch": 0.28, + "learning_rate": 4.532563532962546e-06, + "logits/chosen": -2.368762493133545, + "logits/rejected": -2.450859546661377, + "logps/chosen": -191.6454315185547, + "logps/rejected": -218.98867797851562, + "loss": 0.6917, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.017511827871203423, + "rewards/margins": 0.0574830062687397, + "rewards/rejected": -0.07499483227729797, + "step": 4250 + }, + { + "epoch": 0.28, + "learning_rate": 4.529233763623187e-06, + "logits/chosen": -2.328399896621704, + "logits/rejected": -2.034263849258423, + "logps/chosen": -203.28858947753906, + "logps/rejected": -163.58592224121094, + "loss": 0.6884, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.011880872771143913, + "rewards/margins": 0.08256997168064117, + "rewards/rejected": -0.09445084631443024, + "step": 4260 + }, + { + "epoch": 0.28, + "learning_rate": 4.5258934088064854e-06, + "logits/chosen": -2.2317817211151123, + "logits/rejected": -1.83087158203125, + "logps/chosen": -223.72305297851562, + "logps/rejected": -181.90640258789062, + "loss": 0.6868, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.037422824651002884, + "rewards/margins": 0.12880873680114746, + "rewards/rejected": -0.16623155772686005, + "step": 4270 + }, + { + "epoch": 0.28, + "learning_rate": 4.522542485937369e-06, + "logits/chosen": -2.3460514545440674, + "logits/rejected": -2.08577299118042, + "logps/chosen": -293.10992431640625, + "logps/rejected": -206.9127655029297, + "loss": 0.6889, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.024574220180511475, + "rewards/margins": 0.1044369488954544, + "rewards/rejected": -0.12901116907596588, + "step": 4280 + }, + { + "epoch": 0.28, + "learning_rate": 4.519181012495892e-06, + "logits/chosen": -2.3494815826416016, + "logits/rejected": -2.219589948654175, + "logps/chosen": -247.26657104492188, + "logps/rejected": -227.00888061523438, + "loss": 0.6908, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.05047302693128586, + "rewards/margins": 0.08816438913345337, + "rewards/rejected": -0.13863742351531982, + "step": 4290 + }, + { + "epoch": 0.28, + "learning_rate": 4.515809006017147e-06, + "logits/chosen": -2.274042844772339, + "logits/rejected": -1.9699468612670898, + "logps/chosen": -236.0398712158203, + "logps/rejected": -208.1054229736328, + "loss": 0.6904, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.026302605867385864, + "rewards/margins": 0.07574830204248428, + "rewards/rejected": -0.10205090045928955, + "step": 4300 + }, + { + "epoch": 0.28, + "eval_logits/chosen": -2.286620855331421, + "eval_logits/rejected": -2.1015021800994873, + "eval_logps/chosen": -235.3810272216797, + "eval_logps/rejected": -223.5635223388672, + "eval_loss": 0.6901422142982483, + "eval_rewards/accuracies": 0.6345000267028809, + "eval_rewards/chosen": -0.03376083821058273, + "eval_rewards/margins": 0.08575531840324402, + "eval_rewards/rejected": -0.11951615661382675, + "eval_runtime": 711.2011, + "eval_samples_per_second": 2.812, + "eval_steps_per_second": 1.406, + "step": 4300 + }, + { + "epoch": 0.28, + "learning_rate": 4.512426484091171e-06, + "logits/chosen": -2.418959379196167, + "logits/rejected": -2.085226058959961, + "logps/chosen": -279.13177490234375, + "logps/rejected": -249.38064575195312, + "loss": 0.6927, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.015505967661738396, + "rewards/margins": 0.06541910022497177, + "rewards/rejected": -0.08092506229877472, + "step": 4310 + }, + { + "epoch": 0.28, + "learning_rate": 4.509033464362858e-06, + "logits/chosen": -2.122525215148926, + "logits/rejected": -2.1860134601593018, + "logps/chosen": -243.3289794921875, + "logps/rejected": -266.288330078125, + "loss": 0.6901, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.020630965009331703, + "rewards/margins": 0.08728428930044174, + "rewards/rejected": -0.10791525989770889, + "step": 4320 + }, + { + "epoch": 0.28, + "learning_rate": 4.505629964531857e-06, + "logits/chosen": -2.3952324390411377, + "logits/rejected": -2.192960500717163, + "logps/chosen": -226.1346435546875, + "logps/rejected": -203.77609252929688, + "loss": 0.6888, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.029302721843123436, + "rewards/margins": 0.08454638719558716, + "rewards/rejected": -0.11384911835193634, + "step": 4330 + }, + { + "epoch": 0.28, + "learning_rate": 4.502216002352492e-06, + "logits/chosen": -2.3942387104034424, + "logits/rejected": -2.1669986248016357, + "logps/chosen": -167.1390380859375, + "logps/rejected": -154.9490203857422, + "loss": 0.6914, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.04098113626241684, + "rewards/margins": 0.07224190980195999, + "rewards/rejected": -0.11322303861379623, + "step": 4340 + }, + { + "epoch": 0.28, + "learning_rate": 4.498791595633663e-06, + "logits/chosen": -2.227745771408081, + "logits/rejected": -1.8462340831756592, + "logps/chosen": -265.2237854003906, + "logps/rejected": -183.4816131591797, + "loss": 0.6912, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.02839501202106476, + "rewards/margins": 0.054582733660936356, + "rewards/rejected": -0.08297775685787201, + "step": 4350 + }, + { + "epoch": 0.29, + "learning_rate": 4.495356762238751e-06, + "logits/chosen": -2.472080707550049, + "logits/rejected": -1.9949004650115967, + "logps/chosen": -284.7836608886719, + "logps/rejected": -195.12869262695312, + "loss": 0.6911, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.015857676044106483, + "rewards/margins": 0.07797005027532578, + "rewards/rejected": -0.09382772445678711, + "step": 4360 + }, + { + "epoch": 0.29, + "learning_rate": 4.491911520085532e-06, + "logits/chosen": -2.046393394470215, + "logits/rejected": -1.9404007196426392, + "logps/chosen": -202.2923126220703, + "logps/rejected": -221.1287078857422, + "loss": 0.6899, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.018845614045858383, + "rewards/margins": 0.08063776046037674, + "rewards/rejected": -0.09948337823152542, + "step": 4370 + }, + { + "epoch": 0.29, + "learning_rate": 4.488455887146075e-06, + "logits/chosen": -2.159259080886841, + "logits/rejected": -2.141447067260742, + "logps/chosen": -172.12095642089844, + "logps/rejected": -198.18894958496094, + "loss": 0.6879, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.020088694989681244, + "rewards/margins": 0.12904280424118042, + "rewards/rejected": -0.14913150668144226, + "step": 4380 + }, + { + "epoch": 0.29, + "learning_rate": 4.484989881446654e-06, + "logits/chosen": -2.4215025901794434, + "logits/rejected": -2.220041275024414, + "logps/chosen": -204.8853302001953, + "logps/rejected": -191.7876434326172, + "loss": 0.6919, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.024093201383948326, + "rewards/margins": 0.0484703965485096, + "rewards/rejected": -0.07256358861923218, + "step": 4390 + }, + { + "epoch": 0.29, + "learning_rate": 4.481513521067654e-06, + "logits/chosen": -2.3942711353302, + "logits/rejected": -2.039447546005249, + "logps/chosen": -228.5469512939453, + "logps/rejected": -200.28292846679688, + "loss": 0.6903, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.03772415220737457, + "rewards/margins": 0.07151724398136139, + "rewards/rejected": -0.10924138873815536, + "step": 4400 + }, + { + "epoch": 0.29, + "eval_logits/chosen": -2.292935609817505, + "eval_logits/rejected": -2.107675790786743, + "eval_logps/chosen": -236.54518127441406, + "eval_logps/rejected": -223.5493927001953, + "eval_loss": 0.6902163028717041, + "eval_rewards/accuracies": 0.6274999976158142, + "eval_rewards/chosen": -0.04540235176682472, + "eval_rewards/margins": 0.07397259771823883, + "eval_rewards/rejected": -0.11937494575977325, + "eval_runtime": 713.6438, + "eval_samples_per_second": 2.803, + "eval_steps_per_second": 1.401, + "step": 4400 + }, + { + "epoch": 0.29, + "learning_rate": 4.478026824143473e-06, + "logits/chosen": -2.3092598915100098, + "logits/rejected": -2.173832416534424, + "logps/chosen": -270.6745910644531, + "logps/rejected": -224.7923583984375, + "loss": 0.6862, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.046129751950502396, + "rewards/margins": 0.10685235261917114, + "rewards/rejected": -0.15298210084438324, + "step": 4410 + }, + { + "epoch": 0.29, + "learning_rate": 4.474529808862429e-06, + "logits/chosen": -2.197213649749756, + "logits/rejected": -2.124817371368408, + "logps/chosen": -193.43679809570312, + "logps/rejected": -218.94619750976562, + "loss": 0.6909, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.04616478085517883, + "rewards/margins": 0.07885146141052246, + "rewards/rejected": -0.1250162422657013, + "step": 4420 + }, + { + "epoch": 0.29, + "learning_rate": 4.471022493466669e-06, + "logits/chosen": -2.3107991218566895, + "logits/rejected": -1.9796260595321655, + "logps/chosen": -303.61737060546875, + "logps/rejected": -232.7263641357422, + "loss": 0.6907, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.016782710328698158, + "rewards/margins": 0.06120295077562332, + "rewards/rejected": -0.07798566669225693, + "step": 4430 + }, + { + "epoch": 0.29, + "learning_rate": 4.467504896252066e-06, + "logits/chosen": -2.335106372833252, + "logits/rejected": -2.22440767288208, + "logps/chosen": -252.58322143554688, + "logps/rejected": -233.07406616210938, + "loss": 0.6896, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.028819028288125992, + "rewards/margins": 0.09233070909976959, + "rewards/rejected": -0.12114973366260529, + "step": 4440 + }, + { + "epoch": 0.29, + "learning_rate": 4.463977035568132e-06, + "logits/chosen": -2.1951041221618652, + "logits/rejected": -2.431762933731079, + "logps/chosen": -214.28427124023438, + "logps/rejected": -269.91864013671875, + "loss": 0.691, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.019015159457921982, + "rewards/margins": 0.039287667721509933, + "rewards/rejected": -0.058302827179431915, + "step": 4450 + }, + { + "epoch": 0.29, + "learning_rate": 4.460438929817914e-06, + "logits/chosen": -2.264540672302246, + "logits/rejected": -2.11928129196167, + "logps/chosen": -207.63388061523438, + "logps/rejected": -209.83316040039062, + "loss": 0.6889, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.012141216546297073, + "rewards/margins": 0.06487870216369629, + "rewards/rejected": -0.07701991498470306, + "step": 4460 + }, + { + "epoch": 0.29, + "learning_rate": 4.456890597457907e-06, + "logits/chosen": -2.112905979156494, + "logits/rejected": -2.159135341644287, + "logps/chosen": -216.1064453125, + "logps/rejected": -243.3882293701172, + "loss": 0.6893, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.03970780223608017, + "rewards/margins": 0.09077353030443192, + "rewards/rejected": -0.13048133254051208, + "step": 4470 + }, + { + "epoch": 0.29, + "learning_rate": 4.453332056997951e-06, + "logits/chosen": -2.2369141578674316, + "logits/rejected": -2.2910995483398438, + "logps/chosen": -181.7244110107422, + "logps/rejected": -187.7960662841797, + "loss": 0.6883, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.01764925941824913, + "rewards/margins": 0.10396716743707657, + "rewards/rejected": -0.1216164231300354, + "step": 4480 + }, + { + "epoch": 0.29, + "learning_rate": 4.449763327001134e-06, + "logits/chosen": -2.2684309482574463, + "logits/rejected": -2.174893617630005, + "logps/chosen": -190.84078979492188, + "logps/rejected": -226.34326171875, + "loss": 0.6906, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.020785531029105186, + "rewards/margins": 0.07087056338787079, + "rewards/rejected": -0.09165609627962112, + "step": 4490 + }, + { + "epoch": 0.29, + "learning_rate": 4.446184426083702e-06, + "logits/chosen": -2.249093532562256, + "logits/rejected": -2.0277256965637207, + "logps/chosen": -195.96009826660156, + "logps/rejected": -216.97604370117188, + "loss": 0.6864, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.023957300931215286, + "rewards/margins": 0.12648364901542664, + "rewards/rejected": -0.15044096112251282, + "step": 4500 + }, + { + "epoch": 0.29, + "eval_logits/chosen": -2.307412624359131, + "eval_logits/rejected": -2.1211140155792236, + "eval_logps/chosen": -234.31179809570312, + "eval_logps/rejected": -222.2449493408203, + "eval_loss": 0.6901082396507263, + "eval_rewards/accuracies": 0.6324999928474426, + "eval_rewards/chosen": -0.02306850627064705, + "eval_rewards/margins": 0.0832618772983551, + "eval_rewards/rejected": -0.10633040219545364, + "eval_runtime": 714.5639, + "eval_samples_per_second": 2.799, + "eval_steps_per_second": 1.399, + "step": 4500 + }, + { + "epoch": 0.3, + "learning_rate": 4.442595372914954e-06, + "logits/chosen": -2.3577396869659424, + "logits/rejected": -2.0909037590026855, + "logps/chosen": -236.86007690429688, + "logps/rejected": -160.77267456054688, + "loss": 0.6884, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.0012024863390251994, + "rewards/margins": 0.09186828881502151, + "rewards/rejected": -0.090665802359581, + "step": 4510 + }, + { + "epoch": 0.3, + "learning_rate": 4.43899618621715e-06, + "logits/chosen": -2.303638458251953, + "logits/rejected": -2.097632884979248, + "logps/chosen": -254.14059448242188, + "logps/rejected": -269.0081481933594, + "loss": 0.6907, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.030867312103509903, + "rewards/margins": 0.1167084202170372, + "rewards/rejected": -0.1475757360458374, + "step": 4520 + }, + { + "epoch": 0.3, + "learning_rate": 4.4353868847654105e-06, + "logits/chosen": -2.4185733795166016, + "logits/rejected": -2.1811881065368652, + "logps/chosen": -244.586181640625, + "logps/rejected": -222.5398406982422, + "loss": 0.6862, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.012720689177513123, + "rewards/margins": 0.08021236956119537, + "rewards/rejected": -0.06749166548252106, + "step": 4530 + }, + { + "epoch": 0.3, + "learning_rate": 4.43176748738762e-06, + "logits/chosen": -2.3359453678131104, + "logits/rejected": -2.09609055519104, + "logps/chosen": -233.65359497070312, + "logps/rejected": -248.21719360351562, + "loss": 0.6895, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.01705637015402317, + "rewards/margins": 0.10175220668315887, + "rewards/rejected": -0.1188085675239563, + "step": 4540 + }, + { + "epoch": 0.3, + "learning_rate": 4.4281380129643295e-06, + "logits/chosen": -2.2307958602905273, + "logits/rejected": -2.0683705806732178, + "logps/chosen": -229.425537109375, + "logps/rejected": -228.88040161132812, + "loss": 0.6899, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.0032461367081850767, + "rewards/margins": 0.0992891788482666, + "rewards/rejected": -0.09604303538799286, + "step": 4550 + }, + { + "epoch": 0.3, + "learning_rate": 4.424498480428654e-06, + "logits/chosen": -2.258957862854004, + "logits/rejected": -2.15374755859375, + "logps/chosen": -249.0747528076172, + "logps/rejected": -211.0282745361328, + "loss": 0.6932, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.018028225749731064, + "rewards/margins": 0.01978963240981102, + "rewards/rejected": -0.037817858159542084, + "step": 4560 + }, + { + "epoch": 0.3, + "learning_rate": 4.420848908766178e-06, + "logits/chosen": -2.3698325157165527, + "logits/rejected": -2.2980637550354004, + "logps/chosen": -206.7262420654297, + "logps/rejected": -220.7576141357422, + "loss": 0.69, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 0.005006049759685993, + "rewards/margins": 0.05151135474443436, + "rewards/rejected": -0.04650530219078064, + "step": 4570 + }, + { + "epoch": 0.3, + "learning_rate": 4.417189317014855e-06, + "logits/chosen": -2.2065937519073486, + "logits/rejected": -2.445247173309326, + "logps/chosen": -199.8828582763672, + "logps/rejected": -236.3675994873047, + "loss": 0.6903, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.009130073711276054, + "rewards/margins": 0.057878412306308746, + "rewards/rejected": -0.04874832555651665, + "step": 4580 + }, + { + "epoch": 0.3, + "learning_rate": 4.41351972426491e-06, + "logits/chosen": -2.129570484161377, + "logits/rejected": -2.161388874053955, + "logps/chosen": -248.59689331054688, + "logps/rejected": -309.24725341796875, + "loss": 0.6913, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.018575403839349747, + "rewards/margins": 0.06597335636615753, + "rewards/rejected": -0.08454876393079758, + "step": 4590 + }, + { + "epoch": 0.3, + "learning_rate": 4.409840149658735e-06, + "logits/chosen": -2.2294223308563232, + "logits/rejected": -1.970663070678711, + "logps/chosen": -284.4920654296875, + "logps/rejected": -242.57687377929688, + "loss": 0.6904, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0002518877445254475, + "rewards/margins": 0.08236613124608994, + "rewards/rejected": -0.08211424201726913, + "step": 4600 + }, + { + "epoch": 0.3, + "eval_logits/chosen": -2.3078274726867676, + "eval_logits/rejected": -2.121540069580078, + "eval_logps/chosen": -231.3809051513672, + "eval_logps/rejected": -218.01165771484375, + "eval_loss": 0.6902089715003967, + "eval_rewards/accuracies": 0.6309999823570251, + "eval_rewards/chosen": 0.0062404475174844265, + "eval_rewards/margins": 0.07023809105157852, + "eval_rewards/rejected": -0.06399764865636826, + "eval_runtime": 711.6306, + "eval_samples_per_second": 2.81, + "eval_steps_per_second": 1.405, + "step": 4600 + }, + { + "epoch": 0.3, + "learning_rate": 4.4061506123907925e-06, + "logits/chosen": -2.226529598236084, + "logits/rejected": -2.063323497772217, + "logps/chosen": -263.83251953125, + "logps/rejected": -228.8885040283203, + "loss": 0.6906, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.0025929573457688093, + "rewards/margins": 0.05362165719270706, + "rewards/rejected": -0.051028698682785034, + "step": 4610 + }, + { + "epoch": 0.3, + "learning_rate": 4.402451131707519e-06, + "logits/chosen": -2.4300453662872314, + "logits/rejected": -1.9670915603637695, + "logps/chosen": -208.1881866455078, + "logps/rejected": -143.37477111816406, + "loss": 0.689, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.012772110290825367, + "rewards/margins": 0.1066797599196434, + "rewards/rejected": -0.09390763938426971, + "step": 4620 + }, + { + "epoch": 0.3, + "learning_rate": 4.398741726907215e-06, + "logits/chosen": -2.4595742225646973, + "logits/rejected": -2.141775608062744, + "logps/chosen": -277.5190734863281, + "logps/rejected": -244.45352172851562, + "loss": 0.6886, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.017383281141519547, + "rewards/margins": 0.07261139899492264, + "rewards/rejected": -0.05522811412811279, + "step": 4630 + }, + { + "epoch": 0.3, + "learning_rate": 4.395022417339955e-06, + "logits/chosen": -2.2033920288085938, + "logits/rejected": -2.2470412254333496, + "logps/chosen": -208.9228973388672, + "logps/rejected": -223.9864044189453, + "loss": 0.6913, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.022978752851486206, + "rewards/margins": 0.0657435953617096, + "rewards/rejected": -0.0887223556637764, + "step": 4640 + }, + { + "epoch": 0.3, + "learning_rate": 4.391293222407479e-06, + "logits/chosen": -2.30222487449646, + "logits/rejected": -2.303806781768799, + "logps/chosen": -136.2609405517578, + "logps/rejected": -160.14111328125, + "loss": 0.6907, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.0029621024150401354, + "rewards/margins": 0.05650148540735245, + "rewards/rejected": -0.05353938415646553, + "step": 4650 + }, + { + "epoch": 0.3, + "learning_rate": 4.387554161563094e-06, + "logits/chosen": -2.3135313987731934, + "logits/rejected": -2.2245595455169678, + "logps/chosen": -200.64547729492188, + "logps/rejected": -199.0697479248047, + "loss": 0.6864, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.017812874168157578, + "rewards/margins": 0.10243946313858032, + "rewards/rejected": -0.1202523335814476, + "step": 4660 + }, + { + "epoch": 0.31, + "learning_rate": 4.383805254311575e-06, + "logits/chosen": -2.509479522705078, + "logits/rejected": -2.135781764984131, + "logps/chosen": -257.2196044921875, + "logps/rejected": -218.57418823242188, + "loss": 0.6896, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.010785650461912155, + "rewards/margins": 0.06920838356018066, + "rewards/rejected": -0.07999403774738312, + "step": 4670 + }, + { + "epoch": 0.31, + "learning_rate": 4.380046520209056e-06, + "logits/chosen": -2.3661510944366455, + "logits/rejected": -1.9948110580444336, + "logps/chosen": -202.7303924560547, + "logps/rejected": -186.60891723632812, + "loss": 0.6918, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.024640483781695366, + "rewards/margins": 0.07717674970626831, + "rewards/rejected": -0.10181725025177002, + "step": 4680 + }, + { + "epoch": 0.31, + "learning_rate": 4.376277978862936e-06, + "logits/chosen": -2.2196907997131348, + "logits/rejected": -1.9270433187484741, + "logps/chosen": -227.43930053710938, + "logps/rejected": -193.55284118652344, + "loss": 0.6912, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.014506752602756023, + "rewards/margins": 0.06645031273365021, + "rewards/rejected": -0.0809570699930191, + "step": 4690 + }, + { + "epoch": 0.31, + "learning_rate": 4.372499649931774e-06, + "logits/chosen": -2.1691818237304688, + "logits/rejected": -2.317289113998413, + "logps/chosen": -212.4759063720703, + "logps/rejected": -234.37451171875, + "loss": 0.6854, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0338435061275959, + "rewards/margins": 0.13942193984985352, + "rewards/rejected": -0.1732654571533203, + "step": 4700 + }, + { + "epoch": 0.31, + "eval_logits/chosen": -2.31925368309021, + "eval_logits/rejected": -2.1311213970184326, + "eval_logps/chosen": -235.55807495117188, + "eval_logps/rejected": -224.37208557128906, + "eval_loss": 0.6902915835380554, + "eval_rewards/accuracies": 0.6355000138282776, + "eval_rewards/chosen": -0.03553127497434616, + "eval_rewards/margins": 0.09207045286893845, + "eval_rewards/rejected": -0.1276017278432846, + "eval_runtime": 714.2184, + "eval_samples_per_second": 2.8, + "eval_steps_per_second": 1.4, + "step": 4700 + }, + { + "epoch": 0.31, + "learning_rate": 4.368711553125185e-06, + "logits/chosen": -2.5005226135253906, + "logits/rejected": -2.265688180923462, + "logps/chosen": -281.1730041503906, + "logps/rejected": -229.6286163330078, + "loss": 0.6925, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.0398295521736145, + "rewards/margins": 0.06899070739746094, + "rewards/rejected": -0.10882025957107544, + "step": 4710 + }, + { + "epoch": 0.31, + "learning_rate": 4.364913708203734e-06, + "logits/chosen": -2.398655891418457, + "logits/rejected": -2.0503017902374268, + "logps/chosen": -289.25067138671875, + "logps/rejected": -222.9422149658203, + "loss": 0.6905, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.04354417696595192, + "rewards/margins": 0.09108763188123703, + "rewards/rejected": -0.13463182747364044, + "step": 4720 + }, + { + "epoch": 0.31, + "learning_rate": 4.361106134978844e-06, + "logits/chosen": -2.277704954147339, + "logits/rejected": -2.071712017059326, + "logps/chosen": -272.6820068359375, + "logps/rejected": -265.93670654296875, + "loss": 0.6922, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.022515593096613884, + "rewards/margins": 0.06205441802740097, + "rewards/rejected": -0.08457001298666, + "step": 4730 + }, + { + "epoch": 0.31, + "learning_rate": 4.357288853312681e-06, + "logits/chosen": -2.3490684032440186, + "logits/rejected": -2.26945161819458, + "logps/chosen": -287.81549072265625, + "logps/rejected": -287.1567687988281, + "loss": 0.692, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.04337679222226143, + "rewards/margins": 0.04078169912099838, + "rewards/rejected": -0.08415848016738892, + "step": 4740 + }, + { + "epoch": 0.31, + "learning_rate": 4.353461883118056e-06, + "logits/chosen": -2.249939203262329, + "logits/rejected": -2.112075090408325, + "logps/chosen": -232.1291961669922, + "logps/rejected": -214.3981475830078, + "loss": 0.6927, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.02838616445660591, + "rewards/margins": 0.03420080989599228, + "rewards/rejected": -0.06258697807788849, + "step": 4750 + }, + { + "epoch": 0.31, + "learning_rate": 4.34962524435832e-06, + "logits/chosen": -2.1387410163879395, + "logits/rejected": -2.0375964641571045, + "logps/chosen": -221.3422393798828, + "logps/rejected": -193.26048278808594, + "loss": 0.6922, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.016618115827441216, + "rewards/margins": 0.08751632273197174, + "rewards/rejected": -0.10413442552089691, + "step": 4760 + }, + { + "epoch": 0.31, + "learning_rate": 4.34577895704726e-06, + "logits/chosen": -2.372318744659424, + "logits/rejected": -2.1986355781555176, + "logps/chosen": -263.10650634765625, + "logps/rejected": -244.01565551757812, + "loss": 0.6898, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.01492463331669569, + "rewards/margins": 0.06633206456899643, + "rewards/rejected": -0.08125670254230499, + "step": 4770 + }, + { + "epoch": 0.31, + "learning_rate": 4.3419230412489954e-06, + "logits/chosen": -2.470191478729248, + "logits/rejected": -2.233651638031006, + "logps/chosen": -291.9186096191406, + "logps/rejected": -221.01748657226562, + "loss": 0.6918, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.0255076102912426, + "rewards/margins": 0.051627278327941895, + "rewards/rejected": -0.07713489234447479, + "step": 4780 + }, + { + "epoch": 0.31, + "learning_rate": 4.338057517077872e-06, + "logits/chosen": -2.417341709136963, + "logits/rejected": -2.014641284942627, + "logps/chosen": -193.8696746826172, + "logps/rejected": -168.1341094970703, + "loss": 0.6813, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.0017154158558696508, + "rewards/margins": 0.17616704106330872, + "rewards/rejected": -0.17445163428783417, + "step": 4790 + }, + { + "epoch": 0.31, + "learning_rate": 4.334182404698356e-06, + "logits/chosen": -2.4133429527282715, + "logits/rejected": -1.977574110031128, + "logps/chosen": -234.8191680908203, + "logps/rejected": -160.61514282226562, + "loss": 0.6918, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.04712152108550072, + "rewards/margins": 0.04195799678564072, + "rewards/rejected": -0.08907952159643173, + "step": 4800 + }, + { + "epoch": 0.31, + "eval_logits/chosen": -2.3064463138580322, + "eval_logits/rejected": -2.1199657917022705, + "eval_logps/chosen": -233.79531860351562, + "eval_logps/rejected": -220.7675323486328, + "eval_loss": 0.6901711225509644, + "eval_rewards/accuracies": 0.6384999752044678, + "eval_rewards/chosen": -0.017903409898281097, + "eval_rewards/margins": 0.07365269213914871, + "eval_rewards/rejected": -0.09155610203742981, + "eval_runtime": 711.7896, + "eval_samples_per_second": 2.81, + "eval_steps_per_second": 1.405, + "step": 4800 + }, + { + "epoch": 0.31, + "learning_rate": 4.330297724324933e-06, + "logits/chosen": -2.5674805641174316, + "logits/rejected": -2.029761791229248, + "logps/chosen": -308.3761291503906, + "logps/rejected": -214.9816436767578, + "loss": 0.689, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.0003527544322423637, + "rewards/margins": 0.07805721461772919, + "rewards/rejected": -0.07770445942878723, + "step": 4810 + }, + { + "epoch": 0.32, + "learning_rate": 4.326403496221999e-06, + "logits/chosen": -2.235084056854248, + "logits/rejected": -2.138692617416382, + "logps/chosen": -163.4996337890625, + "logps/rejected": -143.01316833496094, + "loss": 0.6921, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.020706117153167725, + "rewards/margins": 0.06293468922376633, + "rewards/rejected": -0.08364080637693405, + "step": 4820 + }, + { + "epoch": 0.32, + "learning_rate": 4.322499740703755e-06, + "logits/chosen": -2.19960618019104, + "logits/rejected": -2.30405592918396, + "logps/chosen": -193.19754028320312, + "logps/rejected": -224.2078094482422, + "loss": 0.6902, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.010983394458889961, + "rewards/margins": 0.06486980617046356, + "rewards/rejected": -0.07585321366786957, + "step": 4830 + }, + { + "epoch": 0.32, + "learning_rate": 4.318586478134101e-06, + "logits/chosen": -2.2257297039031982, + "logits/rejected": -2.188767671585083, + "logps/chosen": -192.26211547851562, + "logps/rejected": -158.5824737548828, + "loss": 0.6894, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.004053809680044651, + "rewards/margins": 0.06840833276510239, + "rewards/rejected": -0.06435452401638031, + "step": 4840 + }, + { + "epoch": 0.32, + "learning_rate": 4.314663728926534e-06, + "logits/chosen": -2.4708011150360107, + "logits/rejected": -2.215599536895752, + "logps/chosen": -259.12109375, + "logps/rejected": -254.83389282226562, + "loss": 0.6906, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.022453729063272476, + "rewards/margins": 0.0629110336303711, + "rewards/rejected": -0.08536475896835327, + "step": 4850 + }, + { + "epoch": 0.32, + "learning_rate": 4.310731513544033e-06, + "logits/chosen": -2.26763653755188, + "logits/rejected": -2.0921549797058105, + "logps/chosen": -245.0206756591797, + "logps/rejected": -206.82406616210938, + "loss": 0.6903, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.021629411727190018, + "rewards/margins": 0.07882945239543915, + "rewards/rejected": -0.10045886039733887, + "step": 4860 + }, + { + "epoch": 0.32, + "learning_rate": 4.30678985249896e-06, + "logits/chosen": -2.255072832107544, + "logits/rejected": -2.1915435791015625, + "logps/chosen": -159.80569458007812, + "logps/rejected": -188.56588745117188, + "loss": 0.6911, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.026556584984064102, + "rewards/margins": 0.10496687889099121, + "rewards/rejected": -0.13152346014976501, + "step": 4870 + }, + { + "epoch": 0.32, + "learning_rate": 4.302838766352952e-06, + "logits/chosen": -2.2494466304779053, + "logits/rejected": -2.029052495956421, + "logps/chosen": -259.2070007324219, + "logps/rejected": -231.5312042236328, + "loss": 0.6896, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.03614037483930588, + "rewards/margins": 0.08115691691637039, + "rewards/rejected": -0.11729729175567627, + "step": 4880 + }, + { + "epoch": 0.32, + "learning_rate": 4.298878275716806e-06, + "logits/chosen": -2.175429582595825, + "logits/rejected": -2.159177780151367, + "logps/chosen": -196.0753936767578, + "logps/rejected": -203.73773193359375, + "loss": 0.6872, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.04195953160524368, + "rewards/margins": 0.10459339618682861, + "rewards/rejected": -0.1465529352426529, + "step": 4890 + }, + { + "epoch": 0.32, + "learning_rate": 4.294908401250386e-06, + "logits/chosen": -2.3999128341674805, + "logits/rejected": -1.9628560543060303, + "logps/chosen": -217.9789276123047, + "logps/rejected": -182.8585968017578, + "loss": 0.6886, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.032358746975660324, + "rewards/margins": 0.0941200703382492, + "rewards/rejected": -0.12647880613803864, + "step": 4900 + }, + { + "epoch": 0.32, + "eval_logits/chosen": -2.284270763397217, + "eval_logits/rejected": -2.0991475582122803, + "eval_logps/chosen": -234.08592224121094, + "eval_logps/rejected": -222.58131408691406, + "eval_loss": 0.6902449131011963, + "eval_rewards/accuracies": 0.6424999833106995, + "eval_rewards/chosen": -0.020809680223464966, + "eval_rewards/margins": 0.08888448029756546, + "eval_rewards/rejected": -0.10969416052103043, + "eval_runtime": 710.5969, + "eval_samples_per_second": 2.815, + "eval_steps_per_second": 1.407, + "step": 4900 + }, + { + "epoch": 0.32, + "learning_rate": 4.290929163662498e-06, + "logits/chosen": -2.1393580436706543, + "logits/rejected": -1.9254634380340576, + "logps/chosen": -271.26580810546875, + "logps/rejected": -221.15762329101562, + "loss": 0.6891, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.015305581502616405, + "rewards/margins": 0.076082743704319, + "rewards/rejected": -0.09138831496238708, + "step": 4910 + }, + { + "epoch": 0.32, + "learning_rate": 4.286940583710796e-06, + "logits/chosen": -2.330704689025879, + "logits/rejected": -2.207934856414795, + "logps/chosen": -294.7912292480469, + "logps/rejected": -248.0767364501953, + "loss": 0.6919, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.020425736904144287, + "rewards/margins": 0.09915411472320557, + "rewards/rejected": -0.11957985162734985, + "step": 4920 + }, + { + "epoch": 0.32, + "learning_rate": 4.282942682201667e-06, + "logits/chosen": -2.2037787437438965, + "logits/rejected": -1.929842233657837, + "logps/chosen": -255.17098999023438, + "logps/rejected": -224.1762237548828, + "loss": 0.6921, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.0306318998336792, + "rewards/margins": 0.08591778576374054, + "rewards/rejected": -0.11654969304800034, + "step": 4930 + }, + { + "epoch": 0.32, + "learning_rate": 4.278935479990123e-06, + "logits/chosen": -2.479978322982788, + "logits/rejected": -2.227200508117676, + "logps/chosen": -207.30966186523438, + "logps/rejected": -171.01812744140625, + "loss": 0.6892, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.02646293118596077, + "rewards/margins": 0.06679949164390564, + "rewards/rejected": -0.09326241165399551, + "step": 4940 + }, + { + "epoch": 0.32, + "learning_rate": 4.274918997979695e-06, + "logits/chosen": -2.2375760078430176, + "logits/rejected": -2.2690727710723877, + "logps/chosen": -197.79953002929688, + "logps/rejected": -205.44857788085938, + "loss": 0.6919, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0416448749601841, + "rewards/margins": 0.05913674831390381, + "rewards/rejected": -0.10078161954879761, + "step": 4950 + }, + { + "epoch": 0.32, + "learning_rate": 4.270893257122319e-06, + "logits/chosen": -2.1356120109558105, + "logits/rejected": -1.9105371236801147, + "logps/chosen": -230.14260864257812, + "logps/rejected": -272.1700744628906, + "loss": 0.6865, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.014206337742507458, + "rewards/margins": 0.12518110871315002, + "rewards/rejected": -0.13938744366168976, + "step": 4960 + }, + { + "epoch": 0.33, + "learning_rate": 4.266858278418232e-06, + "logits/chosen": -2.1323647499084473, + "logits/rejected": -1.8933664560317993, + "logps/chosen": -238.7414093017578, + "logps/rejected": -223.8094940185547, + "loss": 0.6888, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.021372055634856224, + "rewards/margins": 0.05419282987713814, + "rewards/rejected": -0.07556488364934921, + "step": 4970 + }, + { + "epoch": 0.33, + "learning_rate": 4.26281408291586e-06, + "logits/chosen": -2.3737692832946777, + "logits/rejected": -2.0658111572265625, + "logps/chosen": -242.1832275390625, + "logps/rejected": -223.305419921875, + "loss": 0.6893, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0015409494517371058, + "rewards/margins": 0.10654549300670624, + "rewards/rejected": -0.10500454902648926, + "step": 4980 + }, + { + "epoch": 0.33, + "learning_rate": 4.258760691711706e-06, + "logits/chosen": -2.291581392288208, + "logits/rejected": -2.1656148433685303, + "logps/chosen": -198.389404296875, + "logps/rejected": -202.20599365234375, + "loss": 0.6889, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.009278281591832638, + "rewards/margins": 0.08409412950277328, + "rewards/rejected": -0.0933724194765091, + "step": 4990 + }, + { + "epoch": 0.33, + "learning_rate": 4.254698125950247e-06, + "logits/chosen": -2.530463695526123, + "logits/rejected": -2.2810826301574707, + "logps/chosen": -300.74664306640625, + "logps/rejected": -259.6216735839844, + "loss": 0.6923, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.0026790399570018053, + "rewards/margins": 0.05540703609585762, + "rewards/rejected": -0.05272800475358963, + "step": 5000 + }, + { + "epoch": 0.33, + "eval_logits/chosen": -2.2863876819610596, + "eval_logits/rejected": -2.1009578704833984, + "eval_logps/chosen": -232.66943359375, + "eval_logps/rejected": -220.42218017578125, + "eval_loss": 0.6900946497917175, + "eval_rewards/accuracies": 0.6269999742507935, + "eval_rewards/chosen": -0.006644845940172672, + "eval_rewards/margins": 0.08145791292190552, + "eval_rewards/rejected": -0.08810276538133621, + "eval_runtime": 708.8089, + "eval_samples_per_second": 2.822, + "eval_steps_per_second": 1.411, + "step": 5000 + }, + { + "epoch": 0.33, + "learning_rate": 4.250626406823815e-06, + "logits/chosen": -2.3481221199035645, + "logits/rejected": -2.086394786834717, + "logps/chosen": -217.6735076904297, + "logps/rejected": -247.91268920898438, + "loss": 0.6884, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.008107764646410942, + "rewards/margins": 0.14061132073402405, + "rewards/rejected": -0.14871908724308014, + "step": 5010 + }, + { + "epoch": 0.33, + "learning_rate": 4.246545555572489e-06, + "logits/chosen": -2.260010242462158, + "logits/rejected": -2.139444351196289, + "logps/chosen": -153.61056518554688, + "logps/rejected": -189.370849609375, + "loss": 0.688, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.005855654366314411, + "rewards/margins": 0.10808217525482178, + "rewards/rejected": -0.10222651809453964, + "step": 5020 + }, + { + "epoch": 0.33, + "learning_rate": 4.242455593483992e-06, + "logits/chosen": -2.340317964553833, + "logits/rejected": -2.138221263885498, + "logps/chosen": -218.4593048095703, + "logps/rejected": -174.04708862304688, + "loss": 0.692, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.006946141831576824, + "rewards/margins": 0.052972644567489624, + "rewards/rejected": -0.04602649435400963, + "step": 5030 + }, + { + "epoch": 0.33, + "learning_rate": 4.238356541893567e-06, + "logits/chosen": -2.2381134033203125, + "logits/rejected": -2.103583812713623, + "logps/chosen": -190.62313842773438, + "logps/rejected": -182.2062225341797, + "loss": 0.6898, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.0049001253210008144, + "rewards/margins": 0.07125677168369293, + "rewards/rejected": -0.07615689188241959, + "step": 5040 + }, + { + "epoch": 0.33, + "learning_rate": 4.234248422183876e-06, + "logits/chosen": -2.0986828804016113, + "logits/rejected": -2.2984180450439453, + "logps/chosen": -241.3805694580078, + "logps/rejected": -242.21878051757812, + "loss": 0.6931, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.01554117538034916, + "rewards/margins": 0.0539456307888031, + "rewards/rejected": -0.03840445727109909, + "step": 5050 + }, + { + "epoch": 0.33, + "learning_rate": 4.230131255784884e-06, + "logits/chosen": -2.5071334838867188, + "logits/rejected": -2.1970627307891846, + "logps/chosen": -253.6757354736328, + "logps/rejected": -244.3105926513672, + "loss": 0.688, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.04207398369908333, + "rewards/margins": 0.09431316703557968, + "rewards/rejected": -0.05223918706178665, + "step": 5060 + }, + { + "epoch": 0.33, + "learning_rate": 4.226005064173748e-06, + "logits/chosen": -2.299450159072876, + "logits/rejected": -2.1493353843688965, + "logps/chosen": -261.6854553222656, + "logps/rejected": -286.0189208984375, + "loss": 0.6899, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.029588108882308006, + "rewards/margins": 0.058764077723026276, + "rewards/rejected": -0.02917597070336342, + "step": 5070 + }, + { + "epoch": 0.33, + "learning_rate": 4.2218698688747035e-06, + "logits/chosen": -2.133448362350464, + "logits/rejected": -1.9765691757202148, + "logps/chosen": -233.8523712158203, + "logps/rejected": -196.45217895507812, + "loss": 0.6907, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.01143039483577013, + "rewards/margins": 0.07730044424533844, + "rewards/rejected": -0.0887308269739151, + "step": 5080 + }, + { + "epoch": 0.33, + "learning_rate": 4.217725691458957e-06, + "logits/chosen": -2.4555492401123047, + "logits/rejected": -2.2806808948516846, + "logps/chosen": -183.96484375, + "logps/rejected": -221.15011596679688, + "loss": 0.6875, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.0025927810929715633, + "rewards/margins": 0.10239323228597641, + "rewards/rejected": -0.09980045258998871, + "step": 5090 + }, + { + "epoch": 0.33, + "learning_rate": 4.213572553544565e-06, + "logits/chosen": -2.343311071395874, + "logits/rejected": -2.1145055294036865, + "logps/chosen": -239.4435577392578, + "logps/rejected": -242.26553344726562, + "loss": 0.6914, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.015282916836440563, + "rewards/margins": 0.10186527669429779, + "rewards/rejected": -0.08658237755298615, + "step": 5100 + }, + { + "epoch": 0.33, + "eval_logits/chosen": -2.304870843887329, + "eval_logits/rejected": -2.1186633110046387, + "eval_logps/chosen": -232.49884033203125, + "eval_logps/rejected": -220.59129333496094, + "eval_loss": 0.6902390122413635, + "eval_rewards/accuracies": 0.6365000009536743, + "eval_rewards/chosen": -0.004938941914588213, + "eval_rewards/margins": 0.08485515415668488, + "eval_rewards/rejected": -0.0897940918803215, + "eval_runtime": 709.3713, + "eval_samples_per_second": 2.819, + "eval_steps_per_second": 1.41, + "step": 5100 + }, + { + "epoch": 0.33, + "learning_rate": 4.209410476796331e-06, + "logits/chosen": -2.2306551933288574, + "logits/rejected": -2.1737866401672363, + "logps/chosen": -172.79660034179688, + "logps/rejected": -172.89422607421875, + "loss": 0.6879, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.017051298171281815, + "rewards/margins": 0.10086864233016968, + "rewards/rejected": -0.11791994422674179, + "step": 5110 + }, + { + "epoch": 0.33, + "learning_rate": 4.205239482925686e-06, + "logits/chosen": -2.1063780784606934, + "logits/rejected": -2.153102397918701, + "logps/chosen": -188.4702911376953, + "logps/rejected": -215.7820587158203, + "loss": 0.6914, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.006549468729645014, + "rewards/margins": 0.05185345560312271, + "rewards/rejected": -0.05840292572975159, + "step": 5120 + }, + { + "epoch": 0.34, + "learning_rate": 4.201059593690577e-06, + "logits/chosen": -2.3719985485076904, + "logits/rejected": -2.2848258018493652, + "logps/chosen": -225.9248046875, + "logps/rejected": -207.27273559570312, + "loss": 0.6909, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.0033702123910188675, + "rewards/margins": 0.06776970624923706, + "rewards/rejected": -0.07113991677761078, + "step": 5130 + }, + { + "epoch": 0.34, + "learning_rate": 4.196870830895354e-06, + "logits/chosen": -2.1750078201293945, + "logits/rejected": -2.1885952949523926, + "logps/chosen": -259.3610534667969, + "logps/rejected": -315.05426025390625, + "loss": 0.6913, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.0161347184330225, + "rewards/margins": 0.06185835599899292, + "rewards/rejected": -0.07799308001995087, + "step": 5140 + }, + { + "epoch": 0.34, + "learning_rate": 4.192673216390657e-06, + "logits/chosen": -2.3547465801239014, + "logits/rejected": -2.0885729789733887, + "logps/chosen": -235.29983520507812, + "logps/rejected": -204.89700317382812, + "loss": 0.6884, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.02326892875134945, + "rewards/margins": 0.07924740761518478, + "rewards/rejected": -0.10251633077859879, + "step": 5150 + }, + { + "epoch": 0.34, + "learning_rate": 4.188466772073296e-06, + "logits/chosen": -2.447526454925537, + "logits/rejected": -2.1424593925476074, + "logps/chosen": -224.17477416992188, + "logps/rejected": -207.6041259765625, + "loss": 0.6906, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.03650229796767235, + "rewards/margins": 0.04190974682569504, + "rewards/rejected": -0.07841204106807709, + "step": 5160 + }, + { + "epoch": 0.34, + "learning_rate": 4.184251519886148e-06, + "logits/chosen": -2.190013885498047, + "logits/rejected": -2.2769131660461426, + "logps/chosen": -201.77218627929688, + "logps/rejected": -237.6571807861328, + "loss": 0.6888, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.056646548211574554, + "rewards/margins": 0.07229969650506973, + "rewards/rejected": -0.1289462298154831, + "step": 5170 + }, + { + "epoch": 0.34, + "learning_rate": 4.180027481818033e-06, + "logits/chosen": -2.3080639839172363, + "logits/rejected": -2.2970948219299316, + "logps/chosen": -269.02783203125, + "logps/rejected": -236.8043975830078, + "loss": 0.6903, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.023297477513551712, + "rewards/margins": 0.05610079690814018, + "rewards/rejected": -0.0793982669711113, + "step": 5180 + }, + { + "epoch": 0.34, + "learning_rate": 4.175794679903602e-06, + "logits/chosen": -2.364243745803833, + "logits/rejected": -2.1234567165374756, + "logps/chosen": -227.66976928710938, + "logps/rejected": -164.52684020996094, + "loss": 0.6917, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.00028926803497597575, + "rewards/margins": 0.09214087575674057, + "rewards/rejected": -0.09243014454841614, + "step": 5190 + }, + { + "epoch": 0.34, + "learning_rate": 4.171553136223222e-06, + "logits/chosen": -2.3197431564331055, + "logits/rejected": -2.314145803451538, + "logps/chosen": -269.05267333984375, + "logps/rejected": -290.14276123046875, + "loss": 0.6895, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.017318129539489746, + "rewards/margins": 0.10380220413208008, + "rewards/rejected": -0.12112033367156982, + "step": 5200 + }, + { + "epoch": 0.34, + "eval_logits/chosen": -2.323685884475708, + "eval_logits/rejected": -2.1359636783599854, + "eval_logps/chosen": -234.24876403808594, + "eval_logps/rejected": -221.4422149658203, + "eval_loss": 0.690197765827179, + "eval_rewards/accuracies": 0.6294999718666077, + "eval_rewards/chosen": -0.022438107058405876, + "eval_rewards/margins": 0.0758652612566948, + "eval_rewards/rejected": -0.09830336272716522, + "eval_runtime": 712.3382, + "eval_samples_per_second": 2.808, + "eval_steps_per_second": 1.404, + "step": 5200 + }, + { + "epoch": 0.34, + "learning_rate": 4.167302872902865e-06, + "logits/chosen": -2.3222103118896484, + "logits/rejected": -2.2121872901916504, + "logps/chosen": -255.7134246826172, + "logps/rejected": -254.5579071044922, + "loss": 0.6889, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.042452942579984665, + "rewards/margins": 0.10782526433467865, + "rewards/rejected": -0.1502782106399536, + "step": 5210 + }, + { + "epoch": 0.34, + "learning_rate": 4.163043912113985e-06, + "logits/chosen": -2.354125499725342, + "logits/rejected": -2.1191887855529785, + "logps/chosen": -261.3983459472656, + "logps/rejected": -233.1907196044922, + "loss": 0.6902, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.02358170412480831, + "rewards/margins": 0.05787007883191109, + "rewards/rejected": -0.08145178854465485, + "step": 5220 + }, + { + "epoch": 0.34, + "learning_rate": 4.15877627607341e-06, + "logits/chosen": -2.1449849605560303, + "logits/rejected": -2.0459647178649902, + "logps/chosen": -218.0535430908203, + "logps/rejected": -200.9430694580078, + "loss": 0.6917, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.014587024226784706, + "rewards/margins": 0.0711187794804573, + "rewards/rejected": -0.08570580184459686, + "step": 5230 + }, + { + "epoch": 0.34, + "learning_rate": 4.154499987043217e-06, + "logits/chosen": -2.3814520835876465, + "logits/rejected": -2.1603853702545166, + "logps/chosen": -225.3390655517578, + "logps/rejected": -213.6746826171875, + "loss": 0.6887, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.011354709044098854, + "rewards/margins": 0.10515379905700684, + "rewards/rejected": -0.11650850623846054, + "step": 5240 + }, + { + "epoch": 0.34, + "learning_rate": 4.150215067330625e-06, + "logits/chosen": -2.206449508666992, + "logits/rejected": -2.1436514854431152, + "logps/chosen": -211.77523803710938, + "logps/rejected": -235.5076446533203, + "loss": 0.6894, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.02726421132683754, + "rewards/margins": 0.09695123136043549, + "rewards/rejected": -0.12421544641256332, + "step": 5250 + }, + { + "epoch": 0.34, + "learning_rate": 4.145921539287876e-06, + "logits/chosen": -2.2395777702331543, + "logits/rejected": -1.96908438205719, + "logps/chosen": -193.36337280273438, + "logps/rejected": -183.28408813476562, + "loss": 0.6898, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.027707424014806747, + "rewards/margins": 0.09924031049013138, + "rewards/rejected": -0.12694773077964783, + "step": 5260 + }, + { + "epoch": 0.34, + "learning_rate": 4.141619425312115e-06, + "logits/chosen": -2.3244357109069824, + "logits/rejected": -1.956451654434204, + "logps/chosen": -211.45498657226562, + "logps/rejected": -193.4413299560547, + "loss": 0.6913, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.028037140145897865, + "rewards/margins": 0.06428249180316925, + "rewards/rejected": -0.09231962263584137, + "step": 5270 + }, + { + "epoch": 0.35, + "learning_rate": 4.1373087478452735e-06, + "logits/chosen": -2.4381213188171387, + "logits/rejected": -2.04194974899292, + "logps/chosen": -223.4126739501953, + "logps/rejected": -183.11099243164062, + "loss": 0.6853, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.01827925816178322, + "rewards/margins": 0.14312221109867096, + "rewards/rejected": -0.12484294176101685, + "step": 5280 + }, + { + "epoch": 0.35, + "learning_rate": 4.132989529373959e-06, + "logits/chosen": -2.352229595184326, + "logits/rejected": -1.917790174484253, + "logps/chosen": -258.89263916015625, + "logps/rejected": -188.7851104736328, + "loss": 0.6894, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.029797937721014023, + "rewards/margins": 0.08890985697507858, + "rewards/rejected": -0.1187077984213829, + "step": 5290 + }, + { + "epoch": 0.35, + "learning_rate": 4.128661792429331e-06, + "logits/chosen": -2.3550148010253906, + "logits/rejected": -2.1839187145233154, + "logps/chosen": -257.23114013671875, + "logps/rejected": -266.33551025390625, + "loss": 0.6928, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.034605689346790314, + "rewards/margins": 0.042079828679561615, + "rewards/rejected": -0.07668552547693253, + "step": 5300 + }, + { + "epoch": 0.35, + "eval_logits/chosen": -2.310957908630371, + "eval_logits/rejected": -2.124268054962158, + "eval_logps/chosen": -235.38356018066406, + "eval_logps/rejected": -223.17697143554688, + "eval_loss": 0.6903428435325623, + "eval_rewards/accuracies": 0.6299999952316284, + "eval_rewards/chosen": -0.03378620371222496, + "eval_rewards/margins": 0.08186446130275726, + "eval_rewards/rejected": -0.11565067619085312, + "eval_runtime": 710.7294, + "eval_samples_per_second": 2.814, + "eval_steps_per_second": 1.407, + "step": 5300 + }, + { + "epoch": 0.35, + "learning_rate": 4.124325559586985e-06, + "logits/chosen": -2.0685927867889404, + "logits/rejected": -2.1043925285339355, + "logps/chosen": -195.6849822998047, + "logps/rejected": -210.3332977294922, + "loss": 0.6934, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.07628266513347626, + "rewards/margins": 0.017243212088942528, + "rewards/rejected": -0.09352587163448334, + "step": 5310 + }, + { + "epoch": 0.35, + "learning_rate": 4.119980853466835e-06, + "logits/chosen": -2.27421236038208, + "logits/rejected": -1.881087064743042, + "logps/chosen": -213.4599609375, + "logps/rejected": -195.97486877441406, + "loss": 0.6895, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.04305556043982506, + "rewards/margins": 0.09629078209400177, + "rewards/rejected": -0.13934634625911713, + "step": 5320 + }, + { + "epoch": 0.35, + "learning_rate": 4.115627696732997e-06, + "logits/chosen": -2.199984550476074, + "logits/rejected": -2.0089831352233887, + "logps/chosen": -194.8700714111328, + "logps/rejected": -181.50704956054688, + "loss": 0.6921, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.0345582440495491, + "rewards/margins": 0.05957914516329765, + "rewards/rejected": -0.09413739293813705, + "step": 5330 + }, + { + "epoch": 0.35, + "learning_rate": 4.111266112093668e-06, + "logits/chosen": -2.3105640411376953, + "logits/rejected": -2.106968402862549, + "logps/chosen": -206.64205932617188, + "logps/rejected": -236.0433349609375, + "loss": 0.6889, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.03174377605319023, + "rewards/margins": 0.1100342720746994, + "rewards/rejected": -0.14177805185317993, + "step": 5340 + }, + { + "epoch": 0.35, + "learning_rate": 4.1068961223010115e-06, + "logits/chosen": -2.2916486263275146, + "logits/rejected": -1.9959255456924438, + "logps/chosen": -273.03662109375, + "logps/rejected": -259.06805419921875, + "loss": 0.6876, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.023360053077340126, + "rewards/margins": 0.08707686513662338, + "rewards/rejected": -0.11043691635131836, + "step": 5350 + }, + { + "epoch": 0.35, + "learning_rate": 4.102517750151034e-06, + "logits/chosen": -2.3448870182037354, + "logits/rejected": -2.100322961807251, + "logps/chosen": -295.0011901855469, + "logps/rejected": -228.78164672851562, + "loss": 0.6917, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.00043646543053910136, + "rewards/margins": 0.05093403905630112, + "rewards/rejected": -0.050497572869062424, + "step": 5360 + }, + { + "epoch": 0.35, + "learning_rate": 4.09813101848347e-06, + "logits/chosen": -2.22782564163208, + "logits/rejected": -2.258028984069824, + "logps/chosen": -216.62051391601562, + "logps/rejected": -240.86264038085938, + "loss": 0.694, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.023654133081436157, + "rewards/margins": 0.07087962329387665, + "rewards/rejected": -0.04722550883889198, + "step": 5370 + }, + { + "epoch": 0.35, + "learning_rate": 4.093735950181659e-06, + "logits/chosen": -2.1826157569885254, + "logits/rejected": -2.083122968673706, + "logps/chosen": -220.95883178710938, + "logps/rejected": -255.71231079101562, + "loss": 0.6878, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.04514528065919876, + "rewards/margins": 0.10210150480270386, + "rewards/rejected": -0.0569562129676342, + "step": 5380 + }, + { + "epoch": 0.35, + "learning_rate": 4.0893325681724326e-06, + "logits/chosen": -2.2918612957000732, + "logits/rejected": -2.234261989593506, + "logps/chosen": -258.1329040527344, + "logps/rejected": -252.94076538085938, + "loss": 0.6906, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.0028874489944428205, + "rewards/margins": 0.07087242603302002, + "rewards/rejected": -0.06798496842384338, + "step": 5390 + }, + { + "epoch": 0.35, + "learning_rate": 4.084920895425988e-06, + "logits/chosen": -2.240539073944092, + "logits/rejected": -2.236994981765747, + "logps/chosen": -237.4442901611328, + "logps/rejected": -259.9585876464844, + "loss": 0.689, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.000796229753177613, + "rewards/margins": 0.0886940062046051, + "rewards/rejected": -0.08949021995067596, + "step": 5400 + }, + { + "epoch": 0.35, + "eval_logits/chosen": -2.296610116958618, + "eval_logits/rejected": -2.1113150119781494, + "eval_logps/chosen": -229.67491149902344, + "eval_logps/rejected": -216.7387237548828, + "eval_loss": 0.6902357339859009, + "eval_rewards/accuracies": 0.6334999799728394, + "eval_rewards/chosen": 0.023300452157855034, + "eval_rewards/margins": 0.07456869632005692, + "eval_rewards/rejected": -0.05126824975013733, + "eval_runtime": 710.671, + "eval_samples_per_second": 2.814, + "eval_steps_per_second": 1.407, + "step": 5400 + }, + { + "epoch": 0.35, + "learning_rate": 4.080500954955769e-06, + "logits/chosen": -2.2135491371154785, + "logits/rejected": -1.9314069747924805, + "logps/chosen": -254.9764862060547, + "logps/rejected": -243.9836883544922, + "loss": 0.6905, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.008954535238444805, + "rewards/margins": 0.07293415814638138, + "rewards/rejected": -0.0639796257019043, + "step": 5410 + }, + { + "epoch": 0.35, + "learning_rate": 4.076072769818354e-06, + "logits/chosen": -2.4696648120880127, + "logits/rejected": -2.026599168777466, + "logps/chosen": -242.1929473876953, + "logps/rejected": -186.80911254882812, + "loss": 0.6906, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.015293831005692482, + "rewards/margins": 0.06581826508045197, + "rewards/rejected": -0.05052444338798523, + "step": 5420 + }, + { + "epoch": 0.36, + "learning_rate": 4.071636363113323e-06, + "logits/chosen": -2.07266902923584, + "logits/rejected": -1.993199110031128, + "logps/chosen": -264.7987976074219, + "logps/rejected": -217.65054321289062, + "loss": 0.6902, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.033312998712062836, + "rewards/margins": 0.0689430683851242, + "rewards/rejected": -0.03563006594777107, + "step": 5430 + }, + { + "epoch": 0.36, + "learning_rate": 4.067191757983146e-06, + "logits/chosen": -2.0450809001922607, + "logits/rejected": -2.091036558151245, + "logps/chosen": -226.9083709716797, + "logps/rejected": -234.5966033935547, + "loss": 0.6889, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.02667292393743992, + "rewards/margins": 0.10902222245931625, + "rewards/rejected": -0.08234930038452148, + "step": 5440 + }, + { + "epoch": 0.36, + "learning_rate": 4.062738977613063e-06, + "logits/chosen": -2.237396717071533, + "logits/rejected": -2.115265130996704, + "logps/chosen": -232.11581420898438, + "logps/rejected": -191.1370391845703, + "loss": 0.689, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.023604759946465492, + "rewards/margins": 0.07414443045854568, + "rewards/rejected": -0.05053967237472534, + "step": 5450 + }, + { + "epoch": 0.36, + "learning_rate": 4.058278045230957e-06, + "logits/chosen": -2.193748950958252, + "logits/rejected": -2.1995644569396973, + "logps/chosen": -218.991455078125, + "logps/rejected": -224.9091796875, + "loss": 0.6927, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.006245986558496952, + "rewards/margins": 0.04330100864171982, + "rewards/rejected": -0.049546997994184494, + "step": 5460 + }, + { + "epoch": 0.36, + "learning_rate": 4.053808984107235e-06, + "logits/chosen": -2.3936734199523926, + "logits/rejected": -2.0713841915130615, + "logps/chosen": -233.24472045898438, + "logps/rejected": -198.1108856201172, + "loss": 0.6919, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.003764042630791664, + "rewards/margins": 0.05159348249435425, + "rewards/rejected": -0.05535752698779106, + "step": 5470 + }, + { + "epoch": 0.36, + "learning_rate": 4.04933181755471e-06, + "logits/chosen": -2.3855273723602295, + "logits/rejected": -2.308960437774658, + "logps/chosen": -206.5284423828125, + "logps/rejected": -206.90713500976562, + "loss": 0.6901, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.0005217326688580215, + "rewards/margins": 0.0759122297167778, + "rewards/rejected": -0.07539048790931702, + "step": 5480 + }, + { + "epoch": 0.36, + "learning_rate": 4.044846568928477e-06, + "logits/chosen": -2.2790474891662598, + "logits/rejected": -2.3462178707122803, + "logps/chosen": -261.7860107421875, + "logps/rejected": -259.61175537109375, + "loss": 0.6912, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.002618322381749749, + "rewards/margins": 0.06683683395385742, + "rewards/rejected": -0.06945516169071198, + "step": 5490 + }, + { + "epoch": 0.36, + "learning_rate": 4.040353261625788e-06, + "logits/chosen": -2.444617748260498, + "logits/rejected": -2.0571980476379395, + "logps/chosen": -275.5302734375, + "logps/rejected": -246.3635711669922, + "loss": 0.6884, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.010149752721190453, + "rewards/margins": 0.09463830292224884, + "rewards/rejected": -0.08448855578899384, + "step": 5500 + }, + { + "epoch": 0.36, + "eval_logits/chosen": -2.290515661239624, + "eval_logits/rejected": -2.1054341793060303, + "eval_logps/chosen": -232.49339294433594, + "eval_logps/rejected": -219.36749267578125, + "eval_loss": 0.6903730630874634, + "eval_rewards/accuracies": 0.6230000257492065, + "eval_rewards/chosen": -0.004884431138634682, + "eval_rewards/margins": 0.07267154008150101, + "eval_rewards/rejected": -0.07755597680807114, + "eval_runtime": 709.5644, + "eval_samples_per_second": 2.819, + "eval_steps_per_second": 1.409, + "step": 5500 + }, + { + "epoch": 0.36, + "learning_rate": 4.035851919085936e-06, + "logits/chosen": -2.2773475646972656, + "logits/rejected": -2.1437253952026367, + "logps/chosen": -268.17034912109375, + "logps/rejected": -197.60702514648438, + "loss": 0.688, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.007762535475194454, + "rewards/margins": 0.08148452639579773, + "rewards/rejected": -0.08924706280231476, + "step": 5510 + }, + { + "epoch": 0.36, + "learning_rate": 4.031342564790128e-06, + "logits/chosen": -2.2170324325561523, + "logits/rejected": -2.0734400749206543, + "logps/chosen": -204.36358642578125, + "logps/rejected": -210.1439666748047, + "loss": 0.6871, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.016081126406788826, + "rewards/margins": 0.10553131252527237, + "rewards/rejected": -0.0894501805305481, + "step": 5520 + }, + { + "epoch": 0.36, + "learning_rate": 4.026825222261367e-06, + "logits/chosen": -2.228926658630371, + "logits/rejected": -1.9831037521362305, + "logps/chosen": -179.34925842285156, + "logps/rejected": -177.38687133789062, + "loss": 0.6911, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.06274209171533585, + "rewards/margins": 0.05180812627077103, + "rewards/rejected": -0.11455021053552628, + "step": 5530 + }, + { + "epoch": 0.36, + "learning_rate": 4.022299915064321e-06, + "logits/chosen": -2.300727605819702, + "logits/rejected": -2.096855878829956, + "logps/chosen": -305.073486328125, + "logps/rejected": -261.6318054199219, + "loss": 0.6916, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.007745129056274891, + "rewards/margins": 0.09663836658000946, + "rewards/rejected": -0.08889324963092804, + "step": 5540 + }, + { + "epoch": 0.36, + "learning_rate": 4.017766666805213e-06, + "logits/chosen": -2.120983839035034, + "logits/rejected": -2.074357271194458, + "logps/chosen": -215.736328125, + "logps/rejected": -188.95654296875, + "loss": 0.689, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.02215953730046749, + "rewards/margins": 0.08581807464361191, + "rewards/rejected": -0.10797761380672455, + "step": 5550 + }, + { + "epoch": 0.36, + "learning_rate": 4.013225501131684e-06, + "logits/chosen": -2.3249385356903076, + "logits/rejected": -2.043680191040039, + "logps/chosen": -217.39242553710938, + "logps/rejected": -197.59507751464844, + "loss": 0.691, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.00014842339442111552, + "rewards/margins": 0.071754589676857, + "rewards/rejected": -0.07190301269292831, + "step": 5560 + }, + { + "epoch": 0.36, + "learning_rate": 4.008676441732679e-06, + "logits/chosen": -2.2676405906677246, + "logits/rejected": -1.9487760066986084, + "logps/chosen": -207.52322387695312, + "logps/rejected": -164.7802734375, + "loss": 0.6913, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.012288715690374374, + "rewards/margins": 0.05257941409945488, + "rewards/rejected": -0.06486812978982925, + "step": 5570 + }, + { + "epoch": 0.37, + "learning_rate": 4.00411951233832e-06, + "logits/chosen": -2.412111520767212, + "logits/rejected": -2.0756661891937256, + "logps/chosen": -220.82858276367188, + "logps/rejected": -184.4314727783203, + "loss": 0.6883, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.011034145951271057, + "rewards/margins": 0.0827646404504776, + "rewards/rejected": -0.09379879385232925, + "step": 5580 + }, + { + "epoch": 0.37, + "learning_rate": 3.999554736719785e-06, + "logits/chosen": -2.14727520942688, + "logits/rejected": -2.033402919769287, + "logps/chosen": -291.1112976074219, + "logps/rejected": -254.0940704345703, + "loss": 0.6907, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.008655655197799206, + "rewards/margins": 0.06500723958015442, + "rewards/rejected": -0.056351590901613235, + "step": 5590 + }, + { + "epoch": 0.37, + "learning_rate": 3.994982138689177e-06, + "logits/chosen": -2.4407970905303955, + "logits/rejected": -2.242966413497925, + "logps/chosen": -236.0352783203125, + "logps/rejected": -238.973876953125, + "loss": 0.6901, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.013648864813148975, + "rewards/margins": 0.047897934913635254, + "rewards/rejected": -0.03424907475709915, + "step": 5600 + }, + { + "epoch": 0.37, + "eval_logits/chosen": -2.2870538234710693, + "eval_logits/rejected": -2.1021108627319336, + "eval_logps/chosen": -232.2427520751953, + "eval_logps/rejected": -219.23269653320312, + "eval_loss": 0.6902966499328613, + "eval_rewards/accuracies": 0.6340000033378601, + "eval_rewards/chosen": -0.0023779442999511957, + "eval_rewards/margins": 0.07382997125387192, + "eval_rewards/rejected": -0.07620792090892792, + "eval_runtime": 710.4152, + "eval_samples_per_second": 2.815, + "eval_steps_per_second": 1.408, + "step": 5600 + }, + { + "epoch": 0.37, + "learning_rate": 3.990401742099408e-06, + "logits/chosen": -2.104093074798584, + "logits/rejected": -2.1244332790374756, + "logps/chosen": -179.53659057617188, + "logps/rejected": -176.4581298828125, + "loss": 0.692, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.006734983064234257, + "rewards/margins": 0.04270303249359131, + "rewards/rejected": -0.04943801462650299, + "step": 5610 + }, + { + "epoch": 0.37, + "learning_rate": 3.985813570844072e-06, + "logits/chosen": -2.2303576469421387, + "logits/rejected": -2.086726188659668, + "logps/chosen": -297.89227294921875, + "logps/rejected": -276.9810485839844, + "loss": 0.6915, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.03505517542362213, + "rewards/margins": 0.06640339642763138, + "rewards/rejected": -0.10145857185125351, + "step": 5620 + }, + { + "epoch": 0.37, + "learning_rate": 3.981217648857316e-06, + "logits/chosen": -2.308424711227417, + "logits/rejected": -2.119347333908081, + "logps/chosen": -172.80674743652344, + "logps/rejected": -184.7308349609375, + "loss": 0.6899, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.002663122955709696, + "rewards/margins": 0.10142280906438828, + "rewards/rejected": -0.10408592224121094, + "step": 5630 + }, + { + "epoch": 0.37, + "learning_rate": 3.97661400011372e-06, + "logits/chosen": -2.1583411693573, + "logits/rejected": -2.204422950744629, + "logps/chosen": -244.61489868164062, + "logps/rejected": -238.1432342529297, + "loss": 0.6926, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.015875743702054024, + "rewards/margins": 0.049512267112731934, + "rewards/rejected": -0.0653880164027214, + "step": 5640 + }, + { + "epoch": 0.37, + "learning_rate": 3.972002648628174e-06, + "logits/chosen": -2.1991384029388428, + "logits/rejected": -1.8762391805648804, + "logps/chosen": -277.7632751464844, + "logps/rejected": -236.1746368408203, + "loss": 0.6912, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.007324635982513428, + "rewards/margins": 0.06103179603815079, + "rewards/rejected": -0.06835642457008362, + "step": 5650 + }, + { + "epoch": 0.37, + "learning_rate": 3.967383618455743e-06, + "logits/chosen": -2.3441319465637207, + "logits/rejected": -2.178091526031494, + "logps/chosen": -231.02035522460938, + "logps/rejected": -256.923095703125, + "loss": 0.6892, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.039006225764751434, + "rewards/margins": 0.07712291181087494, + "rewards/rejected": -0.11612913757562637, + "step": 5660 + }, + { + "epoch": 0.37, + "learning_rate": 3.9627569336915515e-06, + "logits/chosen": -2.477116107940674, + "logits/rejected": -2.1675541400909424, + "logps/chosen": -247.08352661132812, + "logps/rejected": -199.73582458496094, + "loss": 0.6887, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.003792487783357501, + "rewards/margins": 0.09328923374414444, + "rewards/rejected": -0.08949675410985947, + "step": 5670 + }, + { + "epoch": 0.37, + "learning_rate": 3.9581226184706555e-06, + "logits/chosen": -2.326864719390869, + "logits/rejected": -2.435724973678589, + "logps/chosen": -193.1772918701172, + "logps/rejected": -269.46685791015625, + "loss": 0.6899, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.014881017617881298, + "rewards/margins": 0.055919043719768524, + "rewards/rejected": -0.04103802144527435, + "step": 5680 + }, + { + "epoch": 0.37, + "learning_rate": 3.953480696967912e-06, + "logits/chosen": -1.9913456439971924, + "logits/rejected": -2.222503662109375, + "logps/chosen": -210.5967254638672, + "logps/rejected": -254.0486602783203, + "loss": 0.6916, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.012251259759068489, + "rewards/margins": 0.07006217539310455, + "rewards/rejected": -0.0823134332895279, + "step": 5690 + }, + { + "epoch": 0.37, + "learning_rate": 3.948831193397857e-06, + "logits/chosen": -2.2036399841308594, + "logits/rejected": -2.187148332595825, + "logps/chosen": -164.9729461669922, + "logps/rejected": -178.32603454589844, + "loss": 0.6906, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.010424559935927391, + "rewards/margins": 0.07527212798595428, + "rewards/rejected": -0.06484757363796234, + "step": 5700 + }, + { + "epoch": 0.37, + "eval_logits/chosen": -2.2823386192321777, + "eval_logits/rejected": -2.0973258018493652, + "eval_logps/chosen": -230.5282440185547, + "eval_logps/rejected": -218.62940979003906, + "eval_loss": 0.6900830268859863, + "eval_rewards/accuracies": 0.6345000267028809, + "eval_rewards/chosen": 0.014767038635909557, + "eval_rewards/margins": 0.08494207262992859, + "eval_rewards/rejected": -0.07017502933740616, + "eval_runtime": 714.2649, + "eval_samples_per_second": 2.8, + "eval_steps_per_second": 1.4, + "step": 5700 + }, + { + "epoch": 0.37, + "learning_rate": 3.94417413201458e-06, + "logits/chosen": -2.185957193374634, + "logits/rejected": -2.038264274597168, + "logps/chosen": -208.8540496826172, + "logps/rejected": -198.67984008789062, + "loss": 0.6888, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.027174552902579308, + "rewards/margins": 0.07592582702636719, + "rewards/rejected": -0.04875127598643303, + "step": 5710 + }, + { + "epoch": 0.37, + "learning_rate": 3.9395095371115935e-06, + "logits/chosen": -2.339968204498291, + "logits/rejected": -2.060576915740967, + "logps/chosen": -209.5446319580078, + "logps/rejected": -207.98910522460938, + "loss": 0.6885, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.031124601140618324, + "rewards/margins": 0.09055650979280472, + "rewards/rejected": -0.059431903064250946, + "step": 5720 + }, + { + "epoch": 0.37, + "learning_rate": 3.93483743302171e-06, + "logits/chosen": -2.2814464569091797, + "logits/rejected": -2.04420804977417, + "logps/chosen": -208.9713592529297, + "logps/rejected": -190.6579132080078, + "loss": 0.6899, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.03820297122001648, + "rewards/margins": 0.0542188361287117, + "rewards/rejected": -0.016015859320759773, + "step": 5730 + }, + { + "epoch": 0.38, + "learning_rate": 3.930157844116913e-06, + "logits/chosen": -2.1364564895629883, + "logits/rejected": -2.0696628093719482, + "logps/chosen": -202.4792938232422, + "logps/rejected": -195.42135620117188, + "loss": 0.69, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.07277990877628326, + "rewards/margins": 0.07660780847072601, + "rewards/rejected": -0.003827892942354083, + "step": 5740 + }, + { + "epoch": 0.38, + "learning_rate": 3.925470794808229e-06, + "logits/chosen": -2.2870192527770996, + "logits/rejected": -1.9282214641571045, + "logps/chosen": -236.171875, + "logps/rejected": -211.88504028320312, + "loss": 0.6897, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.028158491477370262, + "rewards/margins": 0.08530527353286743, + "rewards/rejected": -0.05714678019285202, + "step": 5750 + }, + { + "epoch": 0.38, + "learning_rate": 3.920776309545606e-06, + "logits/chosen": -2.3560986518859863, + "logits/rejected": -2.188704013824463, + "logps/chosen": -147.86883544921875, + "logps/rejected": -148.76467895507812, + "loss": 0.6909, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 0.04838157445192337, + "rewards/margins": 0.07728347182273865, + "rewards/rejected": -0.028901899233460426, + "step": 5760 + }, + { + "epoch": 0.38, + "learning_rate": 3.916074412817778e-06, + "logits/chosen": -2.2517282962799072, + "logits/rejected": -1.886130690574646, + "logps/chosen": -239.67239379882812, + "logps/rejected": -250.0625457763672, + "loss": 0.6881, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.03661612793803215, + "rewards/margins": 0.09867843240499496, + "rewards/rejected": -0.062062300741672516, + "step": 5770 + }, + { + "epoch": 0.38, + "learning_rate": 3.911365129152139e-06, + "logits/chosen": -2.3809988498687744, + "logits/rejected": -2.224855899810791, + "logps/chosen": -227.54214477539062, + "logps/rejected": -230.674560546875, + "loss": 0.6901, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.04891723394393921, + "rewards/margins": 0.08672243356704712, + "rewards/rejected": -0.03780519217252731, + "step": 5780 + }, + { + "epoch": 0.38, + "learning_rate": 3.906648483114623e-06, + "logits/chosen": -2.2592244148254395, + "logits/rejected": -2.1565563678741455, + "logps/chosen": -195.49063110351562, + "logps/rejected": -174.44102478027344, + "loss": 0.6871, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.018330033868551254, + "rewards/margins": 0.1153540164232254, + "rewards/rejected": -0.09702397882938385, + "step": 5790 + }, + { + "epoch": 0.38, + "learning_rate": 3.901924499309564e-06, + "logits/chosen": -2.198864698410034, + "logits/rejected": -2.009559154510498, + "logps/chosen": -229.9792022705078, + "logps/rejected": -206.246337890625, + "loss": 0.69, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.012742845341563225, + "rewards/margins": 0.08873111009597778, + "rewards/rejected": -0.10147394984960556, + "step": 5800 + }, + { + "epoch": 0.38, + "eval_logits/chosen": -2.2906556129455566, + "eval_logits/rejected": -2.104820966720581, + "eval_logps/chosen": -233.96670532226562, + "eval_logps/rejected": -222.71261596679688, + "eval_loss": 0.6902094483375549, + "eval_rewards/accuracies": 0.6365000009536743, + "eval_rewards/chosen": -0.019617412239313126, + "eval_rewards/margins": 0.09138944000005722, + "eval_rewards/rejected": -0.11100686341524124, + "eval_runtime": 710.6333, + "eval_samples_per_second": 2.814, + "eval_steps_per_second": 1.407, + "step": 5800 + }, + { + "epoch": 0.38, + "learning_rate": 3.897193202379575e-06, + "logits/chosen": -2.2877416610717773, + "logits/rejected": -2.117922306060791, + "logps/chosen": -201.8192596435547, + "logps/rejected": -193.40383911132812, + "loss": 0.6898, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.013415491208434105, + "rewards/margins": 0.0855737030506134, + "rewards/rejected": -0.09898919612169266, + "step": 5810 + }, + { + "epoch": 0.38, + "learning_rate": 3.8924546170054215e-06, + "logits/chosen": -2.2300517559051514, + "logits/rejected": -2.1765639781951904, + "logps/chosen": -216.3985595703125, + "logps/rejected": -204.2929229736328, + "loss": 0.6908, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.00490927416831255, + "rewards/margins": 0.0736776739358902, + "rewards/rejected": -0.07858695089817047, + "step": 5820 + }, + { + "epoch": 0.38, + "learning_rate": 3.887708767905883e-06, + "logits/chosen": -2.492140054702759, + "logits/rejected": -2.089107036590576, + "logps/chosen": -243.9584503173828, + "logps/rejected": -184.98101806640625, + "loss": 0.6922, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.001912574516609311, + "rewards/margins": 0.0686551108956337, + "rewards/rejected": -0.07056768238544464, + "step": 5830 + }, + { + "epoch": 0.38, + "learning_rate": 3.882955679837636e-06, + "logits/chosen": -2.250488758087158, + "logits/rejected": -2.1399431228637695, + "logps/chosen": -238.8167724609375, + "logps/rejected": -247.60751342773438, + "loss": 0.6914, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.005602418445050716, + "rewards/margins": 0.06415437906980515, + "rewards/rejected": -0.058551959693431854, + "step": 5840 + }, + { + "epoch": 0.38, + "learning_rate": 3.878195377595113e-06, + "logits/chosen": -2.3308000564575195, + "logits/rejected": -2.1495823860168457, + "logps/chosen": -235.9189453125, + "logps/rejected": -239.89706420898438, + "loss": 0.6902, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.00013175308413337916, + "rewards/margins": 0.09541679173707962, + "rewards/rejected": -0.09554854780435562, + "step": 5850 + }, + { + "epoch": 0.38, + "learning_rate": 3.873427886010384e-06, + "logits/chosen": -2.276776075363159, + "logits/rejected": -2.1244194507598877, + "logps/chosen": -189.79672241210938, + "logps/rejected": -179.958740234375, + "loss": 0.6886, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.006552155129611492, + "rewards/margins": 0.09251097589731216, + "rewards/rejected": -0.0859588235616684, + "step": 5860 + }, + { + "epoch": 0.38, + "learning_rate": 3.868653229953021e-06, + "logits/chosen": -2.3475050926208496, + "logits/rejected": -2.118560314178467, + "logps/chosen": -235.6221923828125, + "logps/rejected": -243.8154754638672, + "loss": 0.6882, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.015926335006952286, + "rewards/margins": 0.10257148742675781, + "rewards/rejected": -0.08664515614509583, + "step": 5870 + }, + { + "epoch": 0.38, + "learning_rate": 3.8638714343299675e-06, + "logits/chosen": -2.2526631355285645, + "logits/rejected": -2.1661479473114014, + "logps/chosen": -218.9933319091797, + "logps/rejected": -241.96755981445312, + "loss": 0.6876, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.006506229750812054, + "rewards/margins": 0.0785847157239914, + "rewards/rejected": -0.08509095013141632, + "step": 5880 + }, + { + "epoch": 0.39, + "learning_rate": 3.859082524085414e-06, + "logits/chosen": -2.2656807899475098, + "logits/rejected": -1.9120867252349854, + "logps/chosen": -273.0644226074219, + "logps/rejected": -219.8885498046875, + "loss": 0.6903, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0072127147577703, + "rewards/margins": 0.07063201814889908, + "rewards/rejected": -0.07784473150968552, + "step": 5890 + }, + { + "epoch": 0.39, + "learning_rate": 3.854286524200659e-06, + "logits/chosen": -2.423177480697632, + "logits/rejected": -2.148200511932373, + "logps/chosen": -276.10162353515625, + "logps/rejected": -229.82730102539062, + "loss": 0.6907, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.0031580955255776644, + "rewards/margins": 0.04362088814377785, + "rewards/rejected": -0.04046279191970825, + "step": 5900 + }, + { + "epoch": 0.39, + "eval_logits/chosen": -2.2796647548675537, + "eval_logits/rejected": -2.094618797302246, + "eval_logps/chosen": -231.79417419433594, + "eval_logps/rejected": -219.75482177734375, + "eval_loss": 0.6900690793991089, + "eval_rewards/accuracies": 0.6384999752044678, + "eval_rewards/chosen": 0.002107798121869564, + "eval_rewards/margins": 0.08353700488805771, + "eval_rewards/rejected": -0.08142919838428497, + "eval_runtime": 710.6336, + "eval_samples_per_second": 2.814, + "eval_steps_per_second": 1.407, + "step": 5900 + }, + { + "epoch": 0.39, + "learning_rate": 3.849483459693991e-06, + "logits/chosen": -2.361053943634033, + "logits/rejected": -2.2258360385894775, + "logps/chosen": -208.7968292236328, + "logps/rejected": -182.0562744140625, + "loss": 0.6855, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.008798656985163689, + "rewards/margins": 0.11928577721118927, + "rewards/rejected": -0.11048711836338043, + "step": 5910 + }, + { + "epoch": 0.39, + "learning_rate": 3.844673355620544e-06, + "logits/chosen": -2.253052234649658, + "logits/rejected": -2.1112794876098633, + "logps/chosen": -245.8029022216797, + "logps/rejected": -217.4182891845703, + "loss": 0.6905, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.007700022310018539, + "rewards/margins": 0.09900878369808197, + "rewards/rejected": -0.09130875021219254, + "step": 5920 + }, + { + "epoch": 0.39, + "learning_rate": 3.839856237072178e-06, + "logits/chosen": -2.0936896800994873, + "logits/rejected": -2.091200351715088, + "logps/chosen": -185.02664184570312, + "logps/rejected": -212.76174926757812, + "loss": 0.6887, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.024525459855794907, + "rewards/margins": 0.10542996972799301, + "rewards/rejected": -0.1299554407596588, + "step": 5930 + }, + { + "epoch": 0.39, + "learning_rate": 3.8350321291773455e-06, + "logits/chosen": -2.0315639972686768, + "logits/rejected": -1.9680635929107666, + "logps/chosen": -200.049560546875, + "logps/rejected": -164.24830627441406, + "loss": 0.6924, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.02472389116883278, + "rewards/margins": 0.08395050466060638, + "rewards/rejected": -0.0592266209423542, + "step": 5940 + }, + { + "epoch": 0.39, + "learning_rate": 3.830201057100953e-06, + "logits/chosen": -2.3199234008789062, + "logits/rejected": -2.351250171661377, + "logps/chosen": -190.0390167236328, + "logps/rejected": -215.13748168945312, + "loss": 0.6889, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.01733965426683426, + "rewards/margins": 0.0960569754242897, + "rewards/rejected": -0.07871732115745544, + "step": 5950 + }, + { + "epoch": 0.39, + "learning_rate": 3.82536304604424e-06, + "logits/chosen": -2.111283302307129, + "logits/rejected": -2.055291175842285, + "logps/chosen": -232.94387817382812, + "logps/rejected": -208.7888641357422, + "loss": 0.6916, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.032203659415245056, + "rewards/margins": 0.075811468064785, + "rewards/rejected": -0.04360780864953995, + "step": 5960 + }, + { + "epoch": 0.39, + "learning_rate": 3.8205181212446435e-06, + "logits/chosen": -2.488105297088623, + "logits/rejected": -2.2576441764831543, + "logps/chosen": -266.7863464355469, + "logps/rejected": -231.9990997314453, + "loss": 0.6912, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03772572800517082, + "rewards/margins": 0.05616292357444763, + "rewards/rejected": -0.01843719184398651, + "step": 5970 + }, + { + "epoch": 0.39, + "learning_rate": 3.815666307975664e-06, + "logits/chosen": -2.29557466506958, + "logits/rejected": -2.167210102081299, + "logps/chosen": -235.0832061767578, + "logps/rejected": -221.5277862548828, + "loss": 0.6921, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.022346094250679016, + "rewards/margins": 0.05626540258526802, + "rewards/rejected": -0.033919308334589005, + "step": 5980 + }, + { + "epoch": 0.39, + "learning_rate": 3.8108076315467346e-06, + "logits/chosen": -2.3752923011779785, + "logits/rejected": -2.240790367126465, + "logps/chosen": -257.206787109375, + "logps/rejected": -189.83230590820312, + "loss": 0.6912, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.0007598648080602288, + "rewards/margins": 0.0735207051038742, + "rewards/rejected": -0.07276083528995514, + "step": 5990 + }, + { + "epoch": 0.39, + "learning_rate": 3.805942117303093e-06, + "logits/chosen": -2.5641138553619385, + "logits/rejected": -2.226461887359619, + "logps/chosen": -314.4233093261719, + "logps/rejected": -274.7497863769531, + "loss": 0.6901, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.016122477129101753, + "rewards/margins": 0.06850017607212067, + "rewards/rejected": -0.05237768962979317, + "step": 6000 + }, + { + "epoch": 0.39, + "eval_logits/chosen": -2.311023235321045, + "eval_logits/rejected": -2.123633861541748, + "eval_logps/chosen": -231.44161987304688, + "eval_logps/rejected": -218.17413330078125, + "eval_loss": 0.6901270747184753, + "eval_rewards/accuracies": 0.6294999718666077, + "eval_rewards/chosen": 0.005633320193737745, + "eval_rewards/margins": 0.07125571370124817, + "eval_rewards/rejected": -0.06562238931655884, + "eval_runtime": 710.6252, + "eval_samples_per_second": 2.814, + "eval_steps_per_second": 1.407, + "step": 6000 + }, + { + "epoch": 0.39, + "learning_rate": 3.8010697906256446e-06, + "logits/chosen": -2.1471753120422363, + "logits/rejected": -2.135615348815918, + "logps/chosen": -210.50021362304688, + "logps/rejected": -188.69100952148438, + "loss": 0.6953, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.038038115948438644, + "rewards/margins": 0.06020959094166756, + "rewards/rejected": -0.0982476994395256, + "step": 6010 + }, + { + "epoch": 0.39, + "learning_rate": 3.7961906769308323e-06, + "logits/chosen": -2.19319486618042, + "logits/rejected": -2.009164571762085, + "logps/chosen": -206.8964385986328, + "logps/rejected": -224.5199737548828, + "loss": 0.6899, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.006612460128962994, + "rewards/margins": 0.0754861980676651, + "rewards/rejected": -0.08209865540266037, + "step": 6020 + }, + { + "epoch": 0.39, + "learning_rate": 3.7913048016705028e-06, + "logits/chosen": -2.2544267177581787, + "logits/rejected": -2.210512638092041, + "logps/chosen": -259.06982421875, + "logps/rejected": -262.7820739746094, + "loss": 0.6904, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.008697474375367165, + "rewards/margins": 0.057602547109127045, + "rewards/rejected": -0.048905082046985626, + "step": 6030 + }, + { + "epoch": 0.4, + "learning_rate": 3.786412190331775e-06, + "logits/chosen": -2.447171211242676, + "logits/rejected": -2.118438720703125, + "logps/chosen": -201.5836944580078, + "logps/rejected": -172.86338806152344, + "loss": 0.6898, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.012768149375915527, + "rewards/margins": 0.08379059284925461, + "rewards/rejected": -0.07102244347333908, + "step": 6040 + }, + { + "epoch": 0.4, + "learning_rate": 3.781512868436906e-06, + "logits/chosen": -2.435929775238037, + "logits/rejected": -2.2919459342956543, + "logps/chosen": -129.0099639892578, + "logps/rejected": -143.07064819335938, + "loss": 0.6902, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.03600788861513138, + "rewards/margins": 0.0627172440290451, + "rewards/rejected": -0.02670934796333313, + "step": 6050 + }, + { + "epoch": 0.4, + "learning_rate": 3.7766068615431605e-06, + "logits/chosen": -2.2673373222351074, + "logits/rejected": -2.2142586708068848, + "logps/chosen": -254.04931640625, + "logps/rejected": -212.45474243164062, + "loss": 0.692, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.030377531424164772, + "rewards/margins": 0.07040407508611679, + "rewards/rejected": -0.04002653807401657, + "step": 6060 + }, + { + "epoch": 0.4, + "learning_rate": 3.771694195242671e-06, + "logits/chosen": -2.4878172874450684, + "logits/rejected": -2.041779041290283, + "logps/chosen": -297.07598876953125, + "logps/rejected": -191.4386444091797, + "loss": 0.691, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.021027540788054466, + "rewards/margins": 0.06093548983335495, + "rewards/rejected": -0.039907947182655334, + "step": 6070 + }, + { + "epoch": 0.4, + "learning_rate": 3.766774895162314e-06, + "logits/chosen": -2.3142666816711426, + "logits/rejected": -2.210665225982666, + "logps/chosen": -244.95858764648438, + "logps/rejected": -193.0878143310547, + "loss": 0.6915, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.015811407938599586, + "rewards/margins": 0.04264925420284271, + "rewards/rejected": -0.026837846264243126, + "step": 6080 + }, + { + "epoch": 0.4, + "learning_rate": 3.7618489869635666e-06, + "logits/chosen": -2.227144479751587, + "logits/rejected": -2.172423839569092, + "logps/chosen": -247.10653686523438, + "logps/rejected": -232.07986450195312, + "loss": 0.6924, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.000796070322394371, + "rewards/margins": 0.04716240614652634, + "rewards/rejected": -0.04795847460627556, + "step": 6090 + }, + { + "epoch": 0.4, + "learning_rate": 3.756916496342379e-06, + "logits/chosen": -2.2466297149658203, + "logits/rejected": -2.30493426322937, + "logps/chosen": -184.02410888671875, + "logps/rejected": -198.5399932861328, + "loss": 0.6889, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.028477992862462997, + "rewards/margins": 0.060630954802036285, + "rewards/rejected": -0.03215295821428299, + "step": 6100 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -2.3196463584899902, + "eval_logits/rejected": -2.131615161895752, + "eval_logps/chosen": -228.611572265625, + "eval_logps/rejected": -215.37451171875, + "eval_loss": 0.6901247501373291, + "eval_rewards/accuracies": 0.6215000152587891, + "eval_rewards/chosen": 0.0339338555932045, + "eval_rewards/margins": 0.0715600922703743, + "eval_rewards/rejected": -0.0376262404024601, + "eval_runtime": 710.3418, + "eval_samples_per_second": 2.816, + "eval_steps_per_second": 1.408, + "step": 6100 + }, + { + "epoch": 0.4, + "learning_rate": 3.751977449029039e-06, + "logits/chosen": -1.9810413122177124, + "logits/rejected": -1.983925461769104, + "logps/chosen": -258.67047119140625, + "logps/rejected": -227.87368774414062, + "loss": 0.6919, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.014060038141906261, + "rewards/margins": 0.06598879396915436, + "rewards/rejected": -0.05192875862121582, + "step": 6110 + }, + { + "epoch": 0.4, + "learning_rate": 3.747031870788037e-06, + "logits/chosen": -2.4974024295806885, + "logits/rejected": -2.138140916824341, + "logps/chosen": -313.6204833984375, + "logps/rejected": -238.707763671875, + "loss": 0.6904, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.03132764250040054, + "rewards/margins": 0.07506566494703293, + "rewards/rejected": -0.043738026171922684, + "step": 6120 + }, + { + "epoch": 0.4, + "learning_rate": 3.7420797874179326e-06, + "logits/chosen": -2.277357578277588, + "logits/rejected": -2.018991708755493, + "logps/chosen": -236.95089721679688, + "logps/rejected": -186.17581176757812, + "loss": 0.6896, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.0015857883263379335, + "rewards/margins": 0.08113957941532135, + "rewards/rejected": -0.07955377548933029, + "step": 6130 + }, + { + "epoch": 0.4, + "learning_rate": 3.7371212247512167e-06, + "logits/chosen": -2.6132965087890625, + "logits/rejected": -2.2684082984924316, + "logps/chosen": -322.7628173828125, + "logps/rejected": -267.7512512207031, + "loss": 0.6899, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.03631674125790596, + "rewards/margins": 0.10096652805805206, + "rewards/rejected": -0.0646497905254364, + "step": 6140 + }, + { + "epoch": 0.4, + "learning_rate": 3.7321562086541817e-06, + "logits/chosen": -2.3771257400512695, + "logits/rejected": -2.2703232765197754, + "logps/chosen": -251.13504028320312, + "logps/rejected": -260.8123474121094, + "loss": 0.6908, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.01845073141157627, + "rewards/margins": 0.08887068182229996, + "rewards/rejected": -0.07041995227336884, + "step": 6150 + }, + { + "epoch": 0.4, + "learning_rate": 3.7271847650267834e-06, + "logits/chosen": -2.168355703353882, + "logits/rejected": -2.0670669078826904, + "logps/chosen": -199.58877563476562, + "logps/rejected": -204.58006286621094, + "loss": 0.6904, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.003458759281784296, + "rewards/margins": 0.058815740048885345, + "rewards/rejected": -0.05535699054598808, + "step": 6160 + }, + { + "epoch": 0.4, + "learning_rate": 3.7222069198025086e-06, + "logits/chosen": -2.1603808403015137, + "logits/rejected": -1.9670292139053345, + "logps/chosen": -209.85214233398438, + "logps/rejected": -197.63839721679688, + "loss": 0.6892, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.013096879236400127, + "rewards/margins": 0.08378522843122482, + "rewards/rejected": -0.09688210487365723, + "step": 6170 + }, + { + "epoch": 0.4, + "learning_rate": 3.7172226989482353e-06, + "logits/chosen": -2.1758627891540527, + "logits/rejected": -1.9652674198150635, + "logps/chosen": -209.81430053710938, + "logps/rejected": -212.7050323486328, + "loss": 0.6916, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.0015852168435230851, + "rewards/margins": 0.05035899206995964, + "rewards/rejected": -0.04877377673983574, + "step": 6180 + }, + { + "epoch": 0.4, + "learning_rate": 3.7122321284641007e-06, + "logits/chosen": -2.4331085681915283, + "logits/rejected": -1.9999637603759766, + "logps/chosen": -363.33984375, + "logps/rejected": -262.52734375, + "loss": 0.6887, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.0055319941602647305, + "rewards/margins": 0.09677392989397049, + "rewards/rejected": -0.10230592638254166, + "step": 6190 + }, + { + "epoch": 0.41, + "learning_rate": 3.707235234383365e-06, + "logits/chosen": -2.3560452461242676, + "logits/rejected": -2.0064382553100586, + "logps/chosen": -258.49041748046875, + "logps/rejected": -178.62417602539062, + "loss": 0.691, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.027635321021080017, + "rewards/margins": 0.06912614405155182, + "rewards/rejected": -0.0414908304810524, + "step": 6200 + }, + { + "epoch": 0.41, + "eval_logits/chosen": -2.3145864009857178, + "eval_logits/rejected": -2.126389265060425, + "eval_logps/chosen": -229.6930694580078, + "eval_logps/rejected": -217.35781860351562, + "eval_loss": 0.689987301826477, + "eval_rewards/accuracies": 0.6284999847412109, + "eval_rewards/chosen": 0.023118959739804268, + "eval_rewards/margins": 0.08057821542024612, + "eval_rewards/rejected": -0.05745925009250641, + "eval_runtime": 712.4839, + "eval_samples_per_second": 2.807, + "eval_steps_per_second": 1.404, + "step": 6200 + }, + { + "epoch": 0.41, + "learning_rate": 3.702232042772277e-06, + "logits/chosen": -2.1632602214813232, + "logits/rejected": -2.1070027351379395, + "logps/chosen": -201.1727294921875, + "logps/rejected": -189.2860565185547, + "loss": 0.689, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.013734388165175915, + "rewards/margins": 0.10467977821826935, + "rewards/rejected": -0.09094538539648056, + "step": 6210 + }, + { + "epoch": 0.41, + "learning_rate": 3.6972225797299325e-06, + "logits/chosen": -2.2812116146087646, + "logits/rejected": -2.31536602973938, + "logps/chosen": -251.81900024414062, + "logps/rejected": -243.4286651611328, + "loss": 0.6881, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.013103622011840343, + "rewards/margins": 0.07545242458581924, + "rewards/rejected": -0.08855602890253067, + "step": 6220 + }, + { + "epoch": 0.41, + "learning_rate": 3.692206871388147e-06, + "logits/chosen": -2.3902740478515625, + "logits/rejected": -1.9351933002471924, + "logps/chosen": -232.5216827392578, + "logps/rejected": -202.56259155273438, + "loss": 0.6897, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.0013294353848323226, + "rewards/margins": 0.1127009242773056, + "rewards/rejected": -0.11403036117553711, + "step": 6230 + }, + { + "epoch": 0.41, + "learning_rate": 3.6871849439113115e-06, + "logits/chosen": -2.090280294418335, + "logits/rejected": -2.0177419185638428, + "logps/chosen": -228.4338836669922, + "logps/rejected": -225.20547485351562, + "loss": 0.6905, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.014769596047699451, + "rewards/margins": 0.0631939023733139, + "rewards/rejected": -0.048424310982227325, + "step": 6240 + }, + { + "epoch": 0.41, + "learning_rate": 3.682156823496259e-06, + "logits/chosen": -2.3378641605377197, + "logits/rejected": -2.049996852874756, + "logps/chosen": -209.41159057617188, + "logps/rejected": -187.8102569580078, + "loss": 0.6914, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.045023590326309204, + "rewards/margins": 0.10764230787754059, + "rewards/rejected": -0.06261870265007019, + "step": 6250 + }, + { + "epoch": 0.41, + "learning_rate": 3.67712253637213e-06, + "logits/chosen": -2.363736629486084, + "logits/rejected": -2.1265621185302734, + "logps/chosen": -291.31158447265625, + "logps/rejected": -208.5716552734375, + "loss": 0.6895, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.009828880429267883, + "rewards/margins": 0.07172620296478271, + "rewards/rejected": -0.06189732626080513, + "step": 6260 + }, + { + "epoch": 0.41, + "learning_rate": 3.672082108800231e-06, + "logits/chosen": -2.1761693954467773, + "logits/rejected": -1.9752609729766846, + "logps/chosen": -216.0406951904297, + "logps/rejected": -189.57601928710938, + "loss": 0.6902, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.022248603403568268, + "rewards/margins": 0.07574795186519623, + "rewards/rejected": -0.0979965478181839, + "step": 6270 + }, + { + "epoch": 0.41, + "learning_rate": 3.6670355670739012e-06, + "logits/chosen": -2.2948927879333496, + "logits/rejected": -2.105381488800049, + "logps/chosen": -160.06436157226562, + "logps/rejected": -167.32986450195312, + "loss": 0.6889, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.018890656530857086, + "rewards/margins": 0.09904204308986664, + "rewards/rejected": -0.08015139400959015, + "step": 6280 + }, + { + "epoch": 0.41, + "learning_rate": 3.6619829375183745e-06, + "logits/chosen": -2.451979160308838, + "logits/rejected": -2.256770610809326, + "logps/chosen": -223.16323852539062, + "logps/rejected": -218.20315551757812, + "loss": 0.6887, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.02839289978146553, + "rewards/margins": 0.11110793054103851, + "rewards/rejected": -0.08271503448486328, + "step": 6290 + }, + { + "epoch": 0.41, + "learning_rate": 3.6569242464906427e-06, + "logits/chosen": -2.307574987411499, + "logits/rejected": -2.1314024925231934, + "logps/chosen": -202.07757568359375, + "logps/rejected": -233.34414672851562, + "loss": 0.6871, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.05651476979255676, + "rewards/margins": 0.10217426717281342, + "rewards/rejected": -0.04565950110554695, + "step": 6300 + }, + { + "epoch": 0.41, + "eval_logits/chosen": -2.3068814277648926, + "eval_logits/rejected": -2.120007276535034, + "eval_logps/chosen": -227.68896484375, + "eval_logps/rejected": -215.39700317382812, + "eval_loss": 0.6900160312652588, + "eval_rewards/accuracies": 0.6370000243186951, + "eval_rewards/chosen": 0.04315978288650513, + "eval_rewards/margins": 0.0810108482837677, + "eval_rewards/rejected": -0.03785106539726257, + "eval_runtime": 711.6068, + "eval_samples_per_second": 2.811, + "eval_steps_per_second": 1.405, + "step": 6300 + }, + { + "epoch": 0.41, + "learning_rate": 3.6518595203793156e-06, + "logits/chosen": -2.1580138206481934, + "logits/rejected": -2.1727097034454346, + "logps/chosen": -251.7514190673828, + "logps/rejected": -275.81744384765625, + "loss": 0.6906, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.06962844729423523, + "rewards/margins": 0.10591252893209457, + "rewards/rejected": -0.03628408536314964, + "step": 6310 + }, + { + "epoch": 0.41, + "learning_rate": 3.646788785604485e-06, + "logits/chosen": -2.288708209991455, + "logits/rejected": -2.1455512046813965, + "logps/chosen": -199.6431884765625, + "logps/rejected": -203.20138549804688, + "loss": 0.6916, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.060151923447847366, + "rewards/margins": 0.05803212523460388, + "rewards/rejected": 0.002119800541549921, + "step": 6320 + }, + { + "epoch": 0.41, + "learning_rate": 3.641712068617588e-06, + "logits/chosen": -2.30918550491333, + "logits/rejected": -2.1908669471740723, + "logps/chosen": -255.9428253173828, + "logps/rejected": -208.373046875, + "loss": 0.693, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.054172348231077194, + "rewards/margins": 0.05974752828478813, + "rewards/rejected": -0.005575177259743214, + "step": 6330 + }, + { + "epoch": 0.41, + "learning_rate": 3.6366293959012673e-06, + "logits/chosen": -2.25718355178833, + "logits/rejected": -2.020131826400757, + "logps/chosen": -176.4119415283203, + "logps/rejected": -165.9830780029297, + "loss": 0.6883, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.04790060594677925, + "rewards/margins": 0.09094378352165222, + "rewards/rejected": -0.04304318130016327, + "step": 6340 + }, + { + "epoch": 0.42, + "learning_rate": 3.631540793969233e-06, + "logits/chosen": -2.403151035308838, + "logits/rejected": -2.379413604736328, + "logps/chosen": -183.47628784179688, + "logps/rejected": -190.08969116210938, + "loss": 0.6908, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.051414500921964645, + "rewards/margins": 0.04935113340616226, + "rewards/rejected": 0.002063371241092682, + "step": 6350 + }, + { + "epoch": 0.42, + "learning_rate": 3.626446289366127e-06, + "logits/chosen": -2.583311080932617, + "logits/rejected": -2.1373889446258545, + "logps/chosen": -218.4346160888672, + "logps/rejected": -150.66162109375, + "loss": 0.6923, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.01714567467570305, + "rewards/margins": 0.037458065897226334, + "rewards/rejected": -0.020312385633587837, + "step": 6360 + }, + { + "epoch": 0.42, + "learning_rate": 3.6213459086673786e-06, + "logits/chosen": -2.3131918907165527, + "logits/rejected": -2.375338315963745, + "logps/chosen": -168.5972900390625, + "logps/rejected": -186.1190185546875, + "loss": 0.6881, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.02340294048190117, + "rewards/margins": 0.08148431777954102, + "rewards/rejected": -0.05808137729763985, + "step": 6370 + }, + { + "epoch": 0.42, + "learning_rate": 3.6162396784790737e-06, + "logits/chosen": -2.1972436904907227, + "logits/rejected": -2.225222110748291, + "logps/chosen": -221.7857666015625, + "logps/rejected": -227.5126495361328, + "loss": 0.6905, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.012212952598929405, + "rewards/margins": 0.058864694088697433, + "rewards/rejected": -0.04665173962712288, + "step": 6380 + }, + { + "epoch": 0.42, + "learning_rate": 3.6111276254378095e-06, + "logits/chosen": -2.2998032569885254, + "logits/rejected": -2.295097827911377, + "logps/chosen": -222.982177734375, + "logps/rejected": -229.3250732421875, + "loss": 0.689, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.04974722862243652, + "rewards/margins": 0.10756466537714005, + "rewards/rejected": -0.05781743675470352, + "step": 6390 + }, + { + "epoch": 0.42, + "learning_rate": 3.606009776210559e-06, + "logits/chosen": -2.290212392807007, + "logits/rejected": -2.0554823875427246, + "logps/chosen": -239.64724731445312, + "logps/rejected": -205.0599365234375, + "loss": 0.6892, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.009764440357685089, + "rewards/margins": 0.09071458876132965, + "rewards/rejected": -0.08095015585422516, + "step": 6400 + }, + { + "epoch": 0.42, + "eval_logits/chosen": -2.3204777240753174, + "eval_logits/rejected": -2.1320362091064453, + "eval_logps/chosen": -229.05621337890625, + "eval_logps/rejected": -217.79949951171875, + "eval_loss": 0.6900865435600281, + "eval_rewards/accuracies": 0.6309999823570251, + "eval_rewards/chosen": 0.02948746271431446, + "eval_rewards/margins": 0.09136352688074112, + "eval_rewards/rejected": -0.06187606602907181, + "eval_runtime": 713.0231, + "eval_samples_per_second": 2.805, + "eval_steps_per_second": 1.402, + "step": 6400 + }, + { + "epoch": 0.42, + "learning_rate": 3.600886157494531e-06, + "logits/chosen": -2.4270999431610107, + "logits/rejected": -2.302009105682373, + "logps/chosen": -260.2608642578125, + "logps/rejected": -257.4969787597656, + "loss": 0.6898, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.025956381112337112, + "rewards/margins": 0.09018175303936005, + "rewards/rejected": -0.06422537565231323, + "step": 6410 + }, + { + "epoch": 0.42, + "learning_rate": 3.5957567960170304e-06, + "logits/chosen": -2.5143160820007324, + "logits/rejected": -1.7987537384033203, + "logps/chosen": -286.97247314453125, + "logps/rejected": -178.82583618164062, + "loss": 0.6887, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.04212607815861702, + "rewards/margins": 0.10511051118373871, + "rewards/rejected": -0.06298444420099258, + "step": 6420 + }, + { + "epoch": 0.42, + "learning_rate": 3.590621718535319e-06, + "logits/chosen": -2.1314139366149902, + "logits/rejected": -1.941144347190857, + "logps/chosen": -197.22164916992188, + "logps/rejected": -206.35385131835938, + "loss": 0.6883, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.004992819391191006, + "rewards/margins": 0.100247822701931, + "rewards/rejected": -0.10524062812328339, + "step": 6430 + }, + { + "epoch": 0.42, + "learning_rate": 3.5854809518364775e-06, + "logits/chosen": -2.3986408710479736, + "logits/rejected": -2.087287425994873, + "logps/chosen": -241.9314422607422, + "logps/rejected": -209.21890258789062, + "loss": 0.6875, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.04215322062373161, + "rewards/margins": 0.11800198256969452, + "rewards/rejected": -0.07584875077009201, + "step": 6440 + }, + { + "epoch": 0.42, + "learning_rate": 3.580334522737262e-06, + "logits/chosen": -2.312293529510498, + "logits/rejected": -2.025383472442627, + "logps/chosen": -197.6604766845703, + "logps/rejected": -173.70423889160156, + "loss": 0.6907, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.04283946007490158, + "rewards/margins": 0.09693354368209839, + "rewards/rejected": -0.05409408360719681, + "step": 6450 + }, + { + "epoch": 0.42, + "learning_rate": 3.575182458083968e-06, + "logits/chosen": -2.2589616775512695, + "logits/rejected": -2.2055575847625732, + "logps/chosen": -239.0443878173828, + "logps/rejected": -221.5413818359375, + "loss": 0.69, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.017606602981686592, + "rewards/margins": 0.10502012073993683, + "rewards/rejected": -0.08741351217031479, + "step": 6460 + }, + { + "epoch": 0.42, + "learning_rate": 3.5700247847522883e-06, + "logits/chosen": -2.358262062072754, + "logits/rejected": -2.2695822715759277, + "logps/chosen": -197.18240356445312, + "logps/rejected": -207.05166625976562, + "loss": 0.6882, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.056155987083911896, + "rewards/margins": 0.09887482225894928, + "rewards/rejected": -0.04271883890032768, + "step": 6470 + }, + { + "epoch": 0.42, + "learning_rate": 3.5648615296471743e-06, + "logits/chosen": -2.153874635696411, + "logits/rejected": -2.1209194660186768, + "logps/chosen": -193.21182250976562, + "logps/rejected": -241.32669067382812, + "loss": 0.6902, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.04150991886854172, + "rewards/margins": 0.10041022300720215, + "rewards/rejected": -0.05890030413866043, + "step": 6480 + }, + { + "epoch": 0.42, + "learning_rate": 3.559692719702693e-06, + "logits/chosen": -2.1794726848602295, + "logits/rejected": -1.8666191101074219, + "logps/chosen": -283.8521728515625, + "logps/rejected": -234.9342498779297, + "loss": 0.6888, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.022929811850190163, + "rewards/margins": 0.07811586558818817, + "rewards/rejected": -0.05518605187535286, + "step": 6490 + }, + { + "epoch": 0.43, + "learning_rate": 3.55451838188189e-06, + "logits/chosen": -2.293243885040283, + "logits/rejected": -2.1898462772369385, + "logps/chosen": -257.29473876953125, + "logps/rejected": -287.2796630859375, + "loss": 0.6918, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.03992018103599548, + "rewards/margins": 0.08051706850528717, + "rewards/rejected": -0.04059688374400139, + "step": 6500 + }, + { + "epoch": 0.43, + "eval_logits/chosen": -2.3292880058288574, + "eval_logits/rejected": -2.1406772136688232, + "eval_logps/chosen": -229.60731506347656, + "eval_logps/rejected": -217.20216369628906, + "eval_loss": 0.6900708675384521, + "eval_rewards/accuracies": 0.6370000243186951, + "eval_rewards/chosen": 0.023976394906640053, + "eval_rewards/margins": 0.07987881451845169, + "eval_rewards/rejected": -0.05590242147445679, + "eval_runtime": 713.1525, + "eval_samples_per_second": 2.804, + "eval_steps_per_second": 1.402, + "step": 6500 + }, + { + "epoch": 0.43, + "learning_rate": 3.549338543176645e-06, + "logits/chosen": -2.3476295471191406, + "logits/rejected": -2.080786943435669, + "logps/chosen": -302.98834228515625, + "logps/rejected": -277.6363220214844, + "loss": 0.6911, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.03500208631157875, + "rewards/margins": 0.07001151889562607, + "rewards/rejected": -0.035009440034627914, + "step": 6510 + }, + { + "epoch": 0.43, + "learning_rate": 3.5441532306075342e-06, + "logits/chosen": -2.294619083404541, + "logits/rejected": -2.282924175262451, + "logps/chosen": -231.687255859375, + "logps/rejected": -283.9492492675781, + "loss": 0.6924, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.002680377336218953, + "rewards/margins": 0.056967295706272125, + "rewards/rejected": -0.05964766815304756, + "step": 6520 + }, + { + "epoch": 0.43, + "learning_rate": 3.5389624712236894e-06, + "logits/chosen": -2.295729637145996, + "logits/rejected": -2.0658695697784424, + "logps/chosen": -212.0193328857422, + "logps/rejected": -188.84963989257812, + "loss": 0.6922, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.020164761692285538, + "rewards/margins": 0.037581443786621094, + "rewards/rejected": -0.017416680231690407, + "step": 6530 + }, + { + "epoch": 0.43, + "learning_rate": 3.533766292102653e-06, + "logits/chosen": -2.2568297386169434, + "logits/rejected": -2.117800235748291, + "logps/chosen": -210.57046508789062, + "logps/rejected": -203.5366973876953, + "loss": 0.6898, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.007259045727550983, + "rewards/margins": 0.04654930159449577, + "rewards/rejected": -0.053808342665433884, + "step": 6540 + }, + { + "epoch": 0.43, + "learning_rate": 3.5285647203502404e-06, + "logits/chosen": -2.4623608589172363, + "logits/rejected": -2.3327670097351074, + "logps/chosen": -253.4337615966797, + "logps/rejected": -221.22885131835938, + "loss": 0.6903, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.038372620940208435, + "rewards/margins": 0.055611032992601395, + "rewards/rejected": -0.01723841205239296, + "step": 6550 + }, + { + "epoch": 0.43, + "learning_rate": 3.5233577831003983e-06, + "logits/chosen": -2.287559986114502, + "logits/rejected": -2.138993501663208, + "logps/chosen": -249.0475616455078, + "logps/rejected": -233.41311645507812, + "loss": 0.6888, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.03128386288881302, + "rewards/margins": 0.08034192025661469, + "rewards/rejected": -0.04905804991722107, + "step": 6560 + }, + { + "epoch": 0.43, + "learning_rate": 3.5181455075150628e-06, + "logits/chosen": -2.2086570262908936, + "logits/rejected": -1.8272384405136108, + "logps/chosen": -180.17086791992188, + "logps/rejected": -139.23220825195312, + "loss": 0.6917, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.0177529938519001, + "rewards/margins": 0.05684244632720947, + "rewards/rejected": -0.03908945992588997, + "step": 6570 + }, + { + "epoch": 0.43, + "learning_rate": 3.512927920784016e-06, + "logits/chosen": -2.37443208694458, + "logits/rejected": -2.232402801513672, + "logps/chosen": -221.47036743164062, + "logps/rejected": -214.24496459960938, + "loss": 0.6877, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.042037345468997955, + "rewards/margins": 0.1275465041399002, + "rewards/rejected": -0.08550916612148285, + "step": 6580 + }, + { + "epoch": 0.43, + "learning_rate": 3.5077050501247457e-06, + "logits/chosen": -2.4588403701782227, + "logits/rejected": -2.0066471099853516, + "logps/chosen": -274.39154052734375, + "logps/rejected": -217.39682006835938, + "loss": 0.6886, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.06623397767543793, + "rewards/margins": 0.10393860191106796, + "rewards/rejected": -0.037704624235630035, + "step": 6590 + }, + { + "epoch": 0.43, + "learning_rate": 3.5024769227823042e-06, + "logits/chosen": -2.353419065475464, + "logits/rejected": -2.2162890434265137, + "logps/chosen": -161.44163513183594, + "logps/rejected": -128.65968322753906, + "loss": 0.6899, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.014296752400696278, + "rewards/margins": 0.08015719056129456, + "rewards/rejected": -0.06586043536663055, + "step": 6600 + }, + { + "epoch": 0.43, + "eval_logits/chosen": -2.337346076965332, + "eval_logits/rejected": -2.1480324268341064, + "eval_logps/chosen": -228.5489959716797, + "eval_logps/rejected": -215.88450622558594, + "eval_loss": 0.6900503039360046, + "eval_rewards/accuracies": 0.6355000138282776, + "eval_rewards/chosen": 0.03455958515405655, + "eval_rewards/margins": 0.07728561758995056, + "eval_rewards/rejected": -0.042726028710603714, + "eval_runtime": 711.4362, + "eval_samples_per_second": 2.811, + "eval_steps_per_second": 1.406, + "step": 6600 + }, + { + "epoch": 0.43, + "learning_rate": 3.4972435660291646e-06, + "logits/chosen": -2.4253835678100586, + "logits/rejected": -2.281532049179077, + "logps/chosen": -240.89852905273438, + "logps/rejected": -223.0607452392578, + "loss": 0.6903, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.022457608953118324, + "rewards/margins": 0.07101938128471375, + "rewards/rejected": -0.04856177791953087, + "step": 6610 + }, + { + "epoch": 0.43, + "learning_rate": 3.492005007165079e-06, + "logits/chosen": -2.2975025177001953, + "logits/rejected": -2.044618844985962, + "logps/chosen": -224.947265625, + "logps/rejected": -237.4292449951172, + "loss": 0.6897, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.04589748755097389, + "rewards/margins": 0.06325410306453705, + "rewards/rejected": -0.017356622964143753, + "step": 6620 + }, + { + "epoch": 0.43, + "learning_rate": 3.4867612735169377e-06, + "logits/chosen": -2.4469258785247803, + "logits/rejected": -2.0971310138702393, + "logps/chosen": -220.57882690429688, + "logps/rejected": -151.8156280517578, + "loss": 0.6882, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.044555455446243286, + "rewards/margins": 0.09465045481920242, + "rewards/rejected": -0.05009499937295914, + "step": 6630 + }, + { + "epoch": 0.43, + "learning_rate": 3.4815123924386226e-06, + "logits/chosen": -2.609863758087158, + "logits/rejected": -2.2980103492736816, + "logps/chosen": -301.5444030761719, + "logps/rejected": -238.43344116210938, + "loss": 0.6907, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.04996975138783455, + "rewards/margins": 0.06561549007892609, + "rewards/rejected": -0.015645746141672134, + "step": 6640 + }, + { + "epoch": 0.44, + "learning_rate": 3.4762583913108696e-06, + "logits/chosen": -2.1854515075683594, + "logits/rejected": -1.9473320245742798, + "logps/chosen": -263.75238037109375, + "logps/rejected": -230.9041290283203, + "loss": 0.6906, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.06005977839231491, + "rewards/margins": 0.08066942542791367, + "rewards/rejected": -0.020609647035598755, + "step": 6650 + }, + { + "epoch": 0.44, + "learning_rate": 3.4709992975411217e-06, + "logits/chosen": -2.3034565448760986, + "logits/rejected": -1.8144235610961914, + "logps/chosen": -250.67794799804688, + "logps/rejected": -206.841796875, + "loss": 0.6889, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.04281472787261009, + "rewards/margins": 0.08430864661931992, + "rewards/rejected": -0.04149392247200012, + "step": 6660 + }, + { + "epoch": 0.44, + "learning_rate": 3.4657351385633886e-06, + "logits/chosen": -2.424379825592041, + "logits/rejected": -2.0678696632385254, + "logps/chosen": -180.195068359375, + "logps/rejected": -179.3686065673828, + "loss": 0.6858, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.05095613747835159, + "rewards/margins": 0.11373704671859741, + "rewards/rejected": -0.06278089433908463, + "step": 6670 + }, + { + "epoch": 0.44, + "learning_rate": 3.4604659418381024e-06, + "logits/chosen": -2.3288955688476562, + "logits/rejected": -1.9227346181869507, + "logps/chosen": -218.0121612548828, + "logps/rejected": -198.50668334960938, + "loss": 0.69, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.0004214264336042106, + "rewards/margins": 0.10025143623352051, + "rewards/rejected": -0.09983000159263611, + "step": 6680 + }, + { + "epoch": 0.44, + "learning_rate": 3.4551917348519744e-06, + "logits/chosen": -2.4326155185699463, + "logits/rejected": -2.1617472171783447, + "logps/chosen": -278.9857482910156, + "logps/rejected": -239.697021484375, + "loss": 0.6894, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.025129491463303566, + "rewards/margins": 0.07911841571331024, + "rewards/rejected": -0.053988922387361526, + "step": 6690 + }, + { + "epoch": 0.44, + "learning_rate": 3.4499125451178505e-06, + "logits/chosen": -2.010340929031372, + "logits/rejected": -2.0988070964813232, + "logps/chosen": -204.6190643310547, + "logps/rejected": -235.9315948486328, + "loss": 0.6914, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.003277752548456192, + "rewards/margins": 0.06699816882610321, + "rewards/rejected": -0.0702759176492691, + "step": 6700 + }, + { + "epoch": 0.44, + "eval_logits/chosen": -2.3288984298706055, + "eval_logits/rejected": -2.1398680210113525, + "eval_logps/chosen": -231.943115234375, + "eval_logps/rejected": -220.5701446533203, + "eval_loss": 0.6900596022605896, + "eval_rewards/accuracies": 0.6384999752044678, + "eval_rewards/chosen": 0.0006183562218211591, + "eval_rewards/margins": 0.09020071476697922, + "eval_rewards/rejected": -0.08958234637975693, + "eval_runtime": 710.1788, + "eval_samples_per_second": 2.816, + "eval_steps_per_second": 1.408, + "step": 6700 + }, + { + "epoch": 0.44, + "learning_rate": 3.4446284001745723e-06, + "logits/chosen": -2.14957857131958, + "logits/rejected": -1.8969390392303467, + "logps/chosen": -210.86453247070312, + "logps/rejected": -234.8885955810547, + "loss": 0.6906, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.046308983117341995, + "rewards/margins": 0.07277072966098785, + "rewards/rejected": -0.11907969415187836, + "step": 6710 + }, + { + "epoch": 0.44, + "learning_rate": 3.439339327586827e-06, + "logits/chosen": -2.2965493202209473, + "logits/rejected": -2.2393410205841064, + "logps/chosen": -188.0366973876953, + "logps/rejected": -191.59263610839844, + "loss": 0.6884, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.019810201600193977, + "rewards/margins": 0.08919601142406464, + "rewards/rejected": -0.06938581168651581, + "step": 6720 + }, + { + "epoch": 0.44, + "learning_rate": 3.434045354945008e-06, + "logits/chosen": -2.3867483139038086, + "logits/rejected": -2.1257870197296143, + "logps/chosen": -275.6525573730469, + "logps/rejected": -277.5914611816406, + "loss": 0.6907, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.0162797961384058, + "rewards/margins": 0.06674771010875702, + "rewards/rejected": -0.08302750438451767, + "step": 6730 + }, + { + "epoch": 0.44, + "learning_rate": 3.4287465098650713e-06, + "logits/chosen": -2.38753604888916, + "logits/rejected": -2.387018918991089, + "logps/chosen": -249.96890258789062, + "logps/rejected": -239.8872833251953, + "loss": 0.6931, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.01993386633694172, + "rewards/margins": 0.04759936034679413, + "rewards/rejected": -0.0675332322716713, + "step": 6740 + }, + { + "epoch": 0.44, + "learning_rate": 3.423442819988387e-06, + "logits/chosen": -2.2356374263763428, + "logits/rejected": -2.1222212314605713, + "logps/chosen": -190.93006896972656, + "logps/rejected": -186.69676208496094, + "loss": 0.7044, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07542804628610611, + "rewards/margins": 0.029561137780547142, + "rewards/rejected": -0.1049891859292984, + "step": 6750 + }, + { + "epoch": 0.44, + "learning_rate": 3.4181343129816e-06, + "logits/chosen": -2.4288418292999268, + "logits/rejected": -2.120914936065674, + "logps/chosen": -175.26609802246094, + "logps/rejected": -167.46408081054688, + "loss": 0.6891, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.010955859906971455, + "rewards/margins": 0.06488125026226044, + "rewards/rejected": -0.053925395011901855, + "step": 6760 + }, + { + "epoch": 0.44, + "learning_rate": 3.4128210165364837e-06, + "logits/chosen": -2.168002128601074, + "logits/rejected": -2.1744346618652344, + "logps/chosen": -187.72097778320312, + "logps/rejected": -209.94015502929688, + "loss": 0.6874, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.03688240796327591, + "rewards/margins": 0.13695737719535828, + "rewards/rejected": -0.10007498413324356, + "step": 6770 + }, + { + "epoch": 0.44, + "learning_rate": 3.407502958369795e-06, + "logits/chosen": -2.279935836791992, + "logits/rejected": -2.1321768760681152, + "logps/chosen": -240.8074951171875, + "logps/rejected": -220.3392333984375, + "loss": 0.6873, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.016402950510382652, + "rewards/margins": 0.08744045346975327, + "rewards/rejected": -0.07103750109672546, + "step": 6780 + }, + { + "epoch": 0.44, + "learning_rate": 3.4021801662231297e-06, + "logits/chosen": -2.3217625617980957, + "logits/rejected": -2.168172836303711, + "logps/chosen": -249.3394317626953, + "logps/rejected": -232.941650390625, + "loss": 0.692, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.004626649431884289, + "rewards/margins": 0.05877614766359329, + "rewards/rejected": -0.06340280920267105, + "step": 6790 + }, + { + "epoch": 0.44, + "learning_rate": 3.3968526678627793e-06, + "logits/chosen": -2.2708561420440674, + "logits/rejected": -1.9311338663101196, + "logps/chosen": -265.8199462890625, + "logps/rejected": -227.452392578125, + "loss": 0.6921, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.007894573733210564, + "rewards/margins": 0.0573604479432106, + "rewards/rejected": -0.06525502353906631, + "step": 6800 + }, + { + "epoch": 0.44, + "eval_logits/chosen": -2.326197862625122, + "eval_logits/rejected": -2.137258529663086, + "eval_logps/chosen": -233.22720336914062, + "eval_logps/rejected": -221.09767150878906, + "eval_loss": 0.6899824142456055, + "eval_rewards/accuracies": 0.6345000267028809, + "eval_rewards/chosen": -0.012222343124449253, + "eval_rewards/margins": 0.08263525366783142, + "eval_rewards/rejected": -0.0948575884103775, + "eval_runtime": 709.2866, + "eval_samples_per_second": 2.82, + "eval_steps_per_second": 1.41, + "step": 6800 + }, + { + "epoch": 0.45, + "learning_rate": 3.391520491079586e-06, + "logits/chosen": -2.4146409034729004, + "logits/rejected": -2.4289791584014893, + "logps/chosen": -196.1842498779297, + "logps/rejected": -171.66029357910156, + "loss": 0.6917, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0022164147812873125, + "rewards/margins": 0.06005682423710823, + "rewards/rejected": -0.062273234128952026, + "step": 6810 + }, + { + "epoch": 0.45, + "learning_rate": 3.3861836636887936e-06, + "logits/chosen": -2.3121285438537598, + "logits/rejected": -2.129061222076416, + "logps/chosen": -267.5245361328125, + "logps/rejected": -231.2607879638672, + "loss": 0.6885, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.007056675851345062, + "rewards/margins": 0.10214730352163315, + "rewards/rejected": -0.1092039942741394, + "step": 6820 + }, + { + "epoch": 0.45, + "learning_rate": 3.3808422135299106e-06, + "logits/chosen": -2.2894935607910156, + "logits/rejected": -2.3739147186279297, + "logps/chosen": -286.0987243652344, + "logps/rejected": -346.9412536621094, + "loss": 0.691, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.004209049511700869, + "rewards/margins": 0.07767010480165482, + "rewards/rejected": -0.0818791538476944, + "step": 6830 + }, + { + "epoch": 0.45, + "learning_rate": 3.375496168466556e-06, + "logits/chosen": -2.468285083770752, + "logits/rejected": -2.0140810012817383, + "logps/chosen": -214.4819793701172, + "logps/rejected": -164.49362182617188, + "loss": 0.691, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.004561436362564564, + "rewards/margins": 0.05938258767127991, + "rewards/rejected": -0.05482115224003792, + "step": 6840 + }, + { + "epoch": 0.45, + "learning_rate": 3.3701455563863205e-06, + "logits/chosen": -2.5253567695617676, + "logits/rejected": -2.0757806301116943, + "logps/chosen": -294.3965148925781, + "logps/rejected": -268.2409362792969, + "loss": 0.6867, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.025218481197953224, + "rewards/margins": 0.1016344428062439, + "rewards/rejected": -0.12685291469097137, + "step": 6850 + }, + { + "epoch": 0.45, + "learning_rate": 3.3647904052006174e-06, + "logits/chosen": -2.3288276195526123, + "logits/rejected": -2.264437675476074, + "logps/chosen": -267.79376220703125, + "logps/rejected": -278.85662841796875, + "loss": 0.6901, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.024954695254564285, + "rewards/margins": 0.08056318014860153, + "rewards/rejected": -0.10551787912845612, + "step": 6860 + }, + { + "epoch": 0.45, + "learning_rate": 3.3594307428445383e-06, + "logits/chosen": -2.529595375061035, + "logits/rejected": -2.1268649101257324, + "logps/chosen": -333.0338439941406, + "logps/rejected": -309.6853942871094, + "loss": 0.691, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.0021690779831260443, + "rewards/margins": 0.08068062365055084, + "rewards/rejected": -0.0785115510225296, + "step": 6870 + }, + { + "epoch": 0.45, + "learning_rate": 3.354066597276707e-06, + "logits/chosen": -2.184455394744873, + "logits/rejected": -2.18107533454895, + "logps/chosen": -224.90921020507812, + "logps/rejected": -266.7405700683594, + "loss": 0.6899, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.013449070043861866, + "rewards/margins": 0.0647086501121521, + "rewards/rejected": -0.07815771549940109, + "step": 6880 + }, + { + "epoch": 0.45, + "learning_rate": 3.348697996479136e-06, + "logits/chosen": -2.405324935913086, + "logits/rejected": -2.148305654525757, + "logps/chosen": -228.61373901367188, + "logps/rejected": -182.25930786132812, + "loss": 0.6907, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.008047891780734062, + "rewards/margins": 0.07117293030023575, + "rewards/rejected": -0.07922081649303436, + "step": 6890 + }, + { + "epoch": 0.45, + "learning_rate": 3.3433249684570757e-06, + "logits/chosen": -2.224512815475464, + "logits/rejected": -2.0861592292785645, + "logps/chosen": -178.4225616455078, + "logps/rejected": -142.22152709960938, + "loss": 0.6881, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.00593004934489727, + "rewards/margins": 0.09517361223697662, + "rewards/rejected": -0.10110366344451904, + "step": 6900 + }, + { + "epoch": 0.45, + "eval_logits/chosen": -2.3220698833465576, + "eval_logits/rejected": -2.133631944656372, + "eval_logps/chosen": -231.9954071044922, + "eval_logps/rejected": -219.68096923828125, + "eval_loss": 0.6899513006210327, + "eval_rewards/accuracies": 0.6309999823570251, + "eval_rewards/chosen": 9.537512232782319e-05, + "eval_rewards/margins": 0.08078599721193314, + "eval_rewards/rejected": -0.08069062978029251, + "eval_runtime": 712.9402, + "eval_samples_per_second": 2.805, + "eval_steps_per_second": 1.403, + "step": 6900 + }, + { + "epoch": 0.45, + "learning_rate": 3.3379475412388724e-06, + "logits/chosen": -2.405919075012207, + "logits/rejected": -2.2413978576660156, + "logps/chosen": -240.8171844482422, + "logps/rejected": -219.07095336914062, + "loss": 0.6885, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.01137818954885006, + "rewards/margins": 0.11952348798513412, + "rewards/rejected": -0.1081453189253807, + "step": 6910 + }, + { + "epoch": 0.45, + "learning_rate": 3.3325657428758207e-06, + "logits/chosen": -2.2108242511749268, + "logits/rejected": -2.1874756813049316, + "logps/chosen": -241.8357696533203, + "logps/rejected": -253.5496063232422, + "loss": 0.6872, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.018397245556116104, + "rewards/margins": 0.10887129604816437, + "rewards/rejected": -0.09047403931617737, + "step": 6920 + }, + { + "epoch": 0.45, + "learning_rate": 3.3271796014420175e-06, + "logits/chosen": -2.3145766258239746, + "logits/rejected": -2.254108428955078, + "logps/chosen": -214.22042846679688, + "logps/rejected": -208.04830932617188, + "loss": 0.6884, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.01770118437707424, + "rewards/margins": 0.11536221206188202, + "rewards/rejected": -0.133063405752182, + "step": 6930 + }, + { + "epoch": 0.45, + "learning_rate": 3.3217891450342142e-06, + "logits/chosen": -2.3170406818389893, + "logits/rejected": -1.9900795221328735, + "logps/chosen": -259.0184020996094, + "logps/rejected": -200.655029296875, + "loss": 0.6892, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.004199774004518986, + "rewards/margins": 0.10629435628652573, + "rewards/rejected": -0.11049413681030273, + "step": 6940 + }, + { + "epoch": 0.45, + "learning_rate": 3.3163944017716733e-06, + "logits/chosen": -2.4827568531036377, + "logits/rejected": -2.180410623550415, + "logps/chosen": -215.23617553710938, + "logps/rejected": -189.19430541992188, + "loss": 0.6907, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.01134683657437563, + "rewards/margins": 0.07055879384279251, + "rewards/rejected": -0.05921195074915886, + "step": 6950 + }, + { + "epoch": 0.46, + "learning_rate": 3.310995399796017e-06, + "logits/chosen": -2.421800374984741, + "logits/rejected": -2.30001163482666, + "logps/chosen": -274.8177185058594, + "logps/rejected": -272.7926940917969, + "loss": 0.6911, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.008314954116940498, + "rewards/margins": 0.047770872712135315, + "rewards/rejected": -0.05608583241701126, + "step": 6960 + }, + { + "epoch": 0.46, + "learning_rate": 3.305592167271085e-06, + "logits/chosen": -2.302924871444702, + "logits/rejected": -2.2044272422790527, + "logps/chosen": -194.39797973632812, + "logps/rejected": -194.43634033203125, + "loss": 0.6886, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.028481673449277878, + "rewards/margins": 0.10950399935245514, + "rewards/rejected": -0.08102231472730637, + "step": 6970 + }, + { + "epoch": 0.46, + "learning_rate": 3.3001847323827846e-06, + "logits/chosen": -2.295625925064087, + "logits/rejected": -2.242480516433716, + "logps/chosen": -266.0675354003906, + "logps/rejected": -272.8329162597656, + "loss": 0.6892, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.020301584154367447, + "rewards/margins": 0.10184173285961151, + "rewards/rejected": -0.08154015243053436, + "step": 6980 + }, + { + "epoch": 0.46, + "learning_rate": 3.2947731233389447e-06, + "logits/chosen": -2.3801562786102295, + "logits/rejected": -2.004241943359375, + "logps/chosen": -247.1204376220703, + "logps/rejected": -202.22914123535156, + "loss": 0.6874, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.0038624543230980635, + "rewards/margins": 0.10536074638366699, + "rewards/rejected": -0.10149829089641571, + "step": 6990 + }, + { + "epoch": 0.46, + "learning_rate": 3.2893573683691706e-06, + "logits/chosen": -2.195082902908325, + "logits/rejected": -2.2036478519439697, + "logps/chosen": -204.31845092773438, + "logps/rejected": -198.5324249267578, + "loss": 0.688, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.003750187810510397, + "rewards/margins": 0.12109758704900742, + "rewards/rejected": -0.11734740436077118, + "step": 7000 + }, + { + "epoch": 0.46, + "eval_logits/chosen": -2.3213798999786377, + "eval_logits/rejected": -2.132957935333252, + "eval_logps/chosen": -232.3555450439453, + "eval_logps/rejected": -220.5653533935547, + "eval_loss": 0.689961850643158, + "eval_rewards/accuracies": 0.6255000233650208, + "eval_rewards/chosen": -0.00350601295940578, + "eval_rewards/margins": 0.08602865040302277, + "eval_rewards/rejected": -0.08953466266393661, + "eval_runtime": 712.7907, + "eval_samples_per_second": 2.806, + "eval_steps_per_second": 1.403, + "step": 7000 + }, + { + "epoch": 0.46, + "learning_rate": 3.2839374957246915e-06, + "logits/chosen": -2.4078869819641113, + "logits/rejected": -2.1226019859313965, + "logps/chosen": -268.3845520019531, + "logps/rejected": -181.95008850097656, + "loss": 0.6916, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.022942641749978065, + "rewards/margins": 0.051037006080150604, + "rewards/rejected": -0.07397964596748352, + "step": 7010 + }, + { + "epoch": 0.46, + "learning_rate": 3.2785135336782187e-06, + "logits/chosen": -2.250103712081909, + "logits/rejected": -2.076584577560425, + "logps/chosen": -234.687744140625, + "logps/rejected": -267.17193603515625, + "loss": 0.6892, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.007723203394562006, + "rewards/margins": 0.10171866416931152, + "rewards/rejected": -0.10944187641143799, + "step": 7020 + }, + { + "epoch": 0.46, + "learning_rate": 3.2730855105237952e-06, + "logits/chosen": -2.4183948040008545, + "logits/rejected": -2.2227070331573486, + "logps/chosen": -218.42391967773438, + "logps/rejected": -272.3433532714844, + "loss": 0.6905, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.018525656312704086, + "rewards/margins": 0.09738930314779282, + "rewards/rejected": -0.07886365056037903, + "step": 7030 + }, + { + "epoch": 0.46, + "learning_rate": 3.2676534545766486e-06, + "logits/chosen": -2.2223055362701416, + "logits/rejected": -2.1927051544189453, + "logps/chosen": -211.2403106689453, + "logps/rejected": -206.35842895507812, + "loss": 0.691, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.012292217463254929, + "rewards/margins": 0.05010954663157463, + "rewards/rejected": -0.037817325443029404, + "step": 7040 + }, + { + "epoch": 0.46, + "learning_rate": 3.262217394173043e-06, + "logits/chosen": -2.335088014602661, + "logits/rejected": -2.052690267562866, + "logps/chosen": -242.60922241210938, + "logps/rejected": -244.936279296875, + "loss": 0.6896, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.017585698515176773, + "rewards/margins": 0.08410107344388962, + "rewards/rejected": -0.10168677568435669, + "step": 7050 + }, + { + "epoch": 0.46, + "learning_rate": 3.2567773576701333e-06, + "logits/chosen": -2.157353162765503, + "logits/rejected": -1.9850852489471436, + "logps/chosen": -257.1738586425781, + "logps/rejected": -238.9078826904297, + "loss": 0.6863, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.014163834974169731, + "rewards/margins": 0.1204962283372879, + "rewards/rejected": -0.1346600502729416, + "step": 7060 + }, + { + "epoch": 0.46, + "learning_rate": 3.2513333734458154e-06, + "logits/chosen": -2.3726444244384766, + "logits/rejected": -2.276071310043335, + "logps/chosen": -207.281982421875, + "logps/rejected": -197.08543395996094, + "loss": 0.6905, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.004406812135130167, + "rewards/margins": 0.06411116570234299, + "rewards/rejected": -0.06851796805858612, + "step": 7070 + }, + { + "epoch": 0.46, + "learning_rate": 3.245885469898576e-06, + "logits/chosen": -2.2665371894836426, + "logits/rejected": -2.051095962524414, + "logps/chosen": -300.17071533203125, + "logps/rejected": -246.3044891357422, + "loss": 0.6893, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.003844373393803835, + "rewards/margins": 0.09821848571300507, + "rewards/rejected": -0.09437411278486252, + "step": 7080 + }, + { + "epoch": 0.46, + "learning_rate": 3.2404336754473497e-06, + "logits/chosen": -2.263822317123413, + "logits/rejected": -2.0003104209899902, + "logps/chosen": -265.4076232910156, + "logps/rejected": -207.5909423828125, + "loss": 0.6927, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.020993733778595924, + "rewards/margins": 0.069535993039608, + "rewards/rejected": -0.04854225367307663, + "step": 7090 + }, + { + "epoch": 0.46, + "learning_rate": 3.234978018531367e-06, + "logits/chosen": -2.587207555770874, + "logits/rejected": -2.156212329864502, + "logps/chosen": -256.6408996582031, + "logps/rejected": -198.82559204101562, + "loss": 0.6893, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.018593590706586838, + "rewards/margins": 0.06984353065490723, + "rewards/rejected": -0.05124994367361069, + "step": 7100 + }, + { + "epoch": 0.46, + "eval_logits/chosen": -2.31289005279541, + "eval_logits/rejected": -2.125515937805176, + "eval_logps/chosen": -231.62696838378906, + "eval_logps/rejected": -219.47421264648438, + "eval_loss": 0.689959704875946, + "eval_rewards/accuracies": 0.6309999823570251, + "eval_rewards/chosen": 0.0037797146942466497, + "eval_rewards/margins": 0.08240301162004471, + "eval_rewards/rejected": -0.07862330228090286, + "eval_runtime": 712.5102, + "eval_samples_per_second": 2.807, + "eval_steps_per_second": 1.403, + "step": 7100 + }, + { + "epoch": 0.47, + "learning_rate": 3.229518527610006e-06, + "logits/chosen": -2.405752182006836, + "logits/rejected": -2.1117520332336426, + "logps/chosen": -291.0060119628906, + "logps/rejected": -252.95669555664062, + "loss": 0.691, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.017840737476944923, + "rewards/margins": 0.07126693427562714, + "rewards/rejected": -0.05342619866132736, + "step": 7110 + }, + { + "epoch": 0.47, + "learning_rate": 3.2240552311626465e-06, + "logits/chosen": -2.3919520378112793, + "logits/rejected": -2.1560964584350586, + "logps/chosen": -239.92935180664062, + "logps/rejected": -231.94467163085938, + "loss": 0.6917, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.012844247743487358, + "rewards/margins": 0.06960373371839523, + "rewards/rejected": -0.056759487837553024, + "step": 7120 + }, + { + "epoch": 0.47, + "learning_rate": 3.2185881576885193e-06, + "logits/chosen": -2.403320550918579, + "logits/rejected": -2.052623748779297, + "logps/chosen": -211.9319305419922, + "logps/rejected": -180.37948608398438, + "loss": 0.6908, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.0020910645835101604, + "rewards/margins": 0.07115035504102707, + "rewards/rejected": -0.07324142754077911, + "step": 7130 + }, + { + "epoch": 0.47, + "learning_rate": 3.213117335706557e-06, + "logits/chosen": -2.279644727706909, + "logits/rejected": -2.392721176147461, + "logps/chosen": -256.3946838378906, + "logps/rejected": -271.6476135253906, + "loss": 0.6922, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0034970403648912907, + "rewards/margins": 0.06046708673238754, + "rewards/rejected": -0.0639641284942627, + "step": 7140 + }, + { + "epoch": 0.47, + "learning_rate": 3.2076427937552473e-06, + "logits/chosen": -2.3162059783935547, + "logits/rejected": -2.0917744636535645, + "logps/chosen": -244.4017791748047, + "logps/rejected": -236.8706512451172, + "loss": 0.6869, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.010880010202527046, + "rewards/margins": 0.11592531204223633, + "rewards/rejected": -0.10504531860351562, + "step": 7150 + }, + { + "epoch": 0.47, + "learning_rate": 3.2021645603924827e-06, + "logits/chosen": -2.1680960655212402, + "logits/rejected": -2.0739083290100098, + "logps/chosen": -137.6443634033203, + "logps/rejected": -159.3563995361328, + "loss": 0.6883, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.008097528479993343, + "rewards/margins": 0.12347575277090073, + "rewards/rejected": -0.11537822335958481, + "step": 7160 + }, + { + "epoch": 0.47, + "learning_rate": 3.196682664195412e-06, + "logits/chosen": -2.2917988300323486, + "logits/rejected": -2.0159618854522705, + "logps/chosen": -205.24533081054688, + "logps/rejected": -175.20578002929688, + "loss": 0.6925, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.011971285566687584, + "rewards/margins": 0.048515014350414276, + "rewards/rejected": -0.06048629805445671, + "step": 7170 + }, + { + "epoch": 0.47, + "learning_rate": 3.191197133760291e-06, + "logits/chosen": -2.5555553436279297, + "logits/rejected": -2.229135751724243, + "logps/chosen": -262.1866760253906, + "logps/rejected": -200.24937438964844, + "loss": 0.6871, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.052167803049087524, + "rewards/margins": 0.1231146901845932, + "rewards/rejected": -0.07094688713550568, + "step": 7180 + }, + { + "epoch": 0.47, + "learning_rate": 3.185707997702334e-06, + "logits/chosen": -2.212904453277588, + "logits/rejected": -2.0473551750183105, + "logps/chosen": -240.8466033935547, + "logps/rejected": -211.2018585205078, + "loss": 0.6891, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0008177075651474297, + "rewards/margins": 0.08206796646118164, + "rewards/rejected": -0.08288567513227463, + "step": 7190 + }, + { + "epoch": 0.47, + "learning_rate": 3.1802152846555624e-06, + "logits/chosen": -2.245023727416992, + "logits/rejected": -2.220525026321411, + "logps/chosen": -222.89584350585938, + "logps/rejected": -215.0545654296875, + "loss": 0.6888, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.013471787795424461, + "rewards/margins": 0.07678040862083435, + "rewards/rejected": -0.06330861151218414, + "step": 7200 + }, + { + "epoch": 0.47, + "eval_logits/chosen": -2.3261678218841553, + "eval_logits/rejected": -2.1375534534454346, + "eval_logps/chosen": -230.54725646972656, + "eval_logps/rejected": -217.60206604003906, + "eval_loss": 0.6899964213371277, + "eval_rewards/accuracies": 0.621999979019165, + "eval_rewards/chosen": 0.014577223919332027, + "eval_rewards/margins": 0.07447873055934906, + "eval_rewards/rejected": -0.059901509433984756, + "eval_runtime": 713.9129, + "eval_samples_per_second": 2.801, + "eval_steps_per_second": 1.401, + "step": 7200 + }, + { + "epoch": 0.47, + "learning_rate": 3.174719023272659e-06, + "logits/chosen": -2.4018406867980957, + "logits/rejected": -2.4472343921661377, + "logps/chosen": -212.6780242919922, + "logps/rejected": -267.15069580078125, + "loss": 0.6882, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.010161024518311024, + "rewards/margins": 0.08123484998941422, + "rewards/rejected": -0.07107381522655487, + "step": 7210 + }, + { + "epoch": 0.47, + "learning_rate": 3.169219242224816e-06, + "logits/chosen": -2.3529484272003174, + "logits/rejected": -2.1854114532470703, + "logps/chosen": -240.903564453125, + "logps/rejected": -241.4931640625, + "loss": 0.6909, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.00976332277059555, + "rewards/margins": 0.06684872508049011, + "rewards/rejected": -0.07661206275224686, + "step": 7220 + }, + { + "epoch": 0.47, + "learning_rate": 3.1637159702015837e-06, + "logits/chosen": -2.3580760955810547, + "logits/rejected": -2.0424869060516357, + "logps/chosen": -205.4845428466797, + "logps/rejected": -197.5430450439453, + "loss": 0.6888, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.004075945820659399, + "rewards/margins": 0.11381276696920395, + "rewards/rejected": -0.10973681509494781, + "step": 7230 + }, + { + "epoch": 0.47, + "learning_rate": 3.1582092359107263e-06, + "logits/chosen": -2.4041996002197266, + "logits/rejected": -2.184713840484619, + "logps/chosen": -278.28271484375, + "logps/rejected": -241.3439483642578, + "loss": 0.6935, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.9874186515808105e-05, + "rewards/margins": 0.08891940116882324, + "rewards/rejected": -0.08896928280591965, + "step": 7240 + }, + { + "epoch": 0.47, + "learning_rate": 3.152699068078067e-06, + "logits/chosen": -2.2145161628723145, + "logits/rejected": -2.00947904586792, + "logps/chosen": -284.87933349609375, + "logps/rejected": -276.2130432128906, + "loss": 0.6879, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.045890286564826965, + "rewards/margins": 0.11874841153621674, + "rewards/rejected": -0.1646386981010437, + "step": 7250 + }, + { + "epoch": 0.48, + "learning_rate": 3.1471854954473415e-06, + "logits/chosen": -2.354721784591675, + "logits/rejected": -2.3982090950012207, + "logps/chosen": -248.9252471923828, + "logps/rejected": -255.275634765625, + "loss": 0.6894, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.018414465710520744, + "rewards/margins": 0.10118373483419418, + "rewards/rejected": -0.08276927471160889, + "step": 7260 + }, + { + "epoch": 0.48, + "learning_rate": 3.1416685467800436e-06, + "logits/chosen": -2.1534171104431152, + "logits/rejected": -2.116290330886841, + "logps/chosen": -188.9571990966797, + "logps/rejected": -176.0358428955078, + "loss": 0.6901, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.03374389559030533, + "rewards/margins": 0.09608308225870132, + "rewards/rejected": -0.12982699275016785, + "step": 7270 + }, + { + "epoch": 0.48, + "learning_rate": 3.1361482508552803e-06, + "logits/chosen": -2.3551740646362305, + "logits/rejected": -1.8543596267700195, + "logps/chosen": -242.98074340820312, + "logps/rejected": -206.5134735107422, + "loss": 0.6903, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.010297993198037148, + "rewards/margins": 0.06919042766094208, + "rewards/rejected": -0.07948841899633408, + "step": 7280 + }, + { + "epoch": 0.48, + "learning_rate": 3.1306246364696198e-06, + "logits/chosen": -2.481549024581909, + "logits/rejected": -2.280365228652954, + "logps/chosen": -251.1072235107422, + "logps/rejected": -242.42062377929688, + "loss": 0.6903, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.013671429827809334, + "rewards/margins": 0.08818572014570236, + "rewards/rejected": -0.07451429218053818, + "step": 7290 + }, + { + "epoch": 0.48, + "learning_rate": 3.1250977324369413e-06, + "logits/chosen": -2.2711830139160156, + "logits/rejected": -2.1912612915039062, + "logps/chosen": -157.5973663330078, + "logps/rejected": -173.55458068847656, + "loss": 0.6907, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.013550999574363232, + "rewards/margins": 0.09245215356349945, + "rewards/rejected": -0.07890114188194275, + "step": 7300 + }, + { + "epoch": 0.48, + "eval_logits/chosen": -2.314774990081787, + "eval_logits/rejected": -2.127025604248047, + "eval_logps/chosen": -232.7455596923828, + "eval_logps/rejected": -220.20619201660156, + "eval_loss": 0.6899450421333313, + "eval_rewards/accuracies": 0.6290000081062317, + "eval_rewards/chosen": -0.007406196556985378, + "eval_rewards/margins": 0.07853667438030243, + "eval_rewards/rejected": -0.08594285696744919, + "eval_runtime": 714.0048, + "eval_samples_per_second": 2.801, + "eval_steps_per_second": 1.401, + "step": 7300 + }, + { + "epoch": 0.48, + "learning_rate": 3.1195675675882825e-06, + "logits/chosen": -2.2243969440460205, + "logits/rejected": -2.1143479347229004, + "logps/chosen": -238.1915740966797, + "logps/rejected": -213.28317260742188, + "loss": 0.6909, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": -0.030005771666765213, + "rewards/margins": 0.05696401745080948, + "rewards/rejected": -0.0869697779417038, + "step": 7310 + }, + { + "epoch": 0.48, + "learning_rate": 3.1140341707716926e-06, + "logits/chosen": -2.176473617553711, + "logits/rejected": -1.929321527481079, + "logps/chosen": -196.3441162109375, + "logps/rejected": -165.09817504882812, + "loss": 0.6866, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.005364052020013332, + "rewards/margins": 0.11770284175872803, + "rewards/rejected": -0.12306687980890274, + "step": 7320 + }, + { + "epoch": 0.48, + "learning_rate": 3.1084975708520803e-06, + "logits/chosen": -2.4346864223480225, + "logits/rejected": -2.034972906112671, + "logps/chosen": -260.06732177734375, + "logps/rejected": -198.18801879882812, + "loss": 0.6909, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.016565248370170593, + "rewards/margins": 0.08867197483778, + "rewards/rejected": -0.0721067264676094, + "step": 7330 + }, + { + "epoch": 0.48, + "learning_rate": 3.1029577967110625e-06, + "logits/chosen": -2.4259464740753174, + "logits/rejected": -2.2102203369140625, + "logps/chosen": -216.46865844726562, + "logps/rejected": -171.20114135742188, + "loss": 0.6911, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.0007433153805322945, + "rewards/margins": 0.03790837153792381, + "rewards/rejected": -0.038651686161756516, + "step": 7340 + }, + { + "epoch": 0.48, + "learning_rate": 3.097414877246814e-06, + "logits/chosen": -2.2673535346984863, + "logits/rejected": -2.0183987617492676, + "logps/chosen": -197.57110595703125, + "logps/rejected": -177.40805053710938, + "loss": 0.6862, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.010723413899540901, + "rewards/margins": 0.09349813312292099, + "rewards/rejected": -0.08277471363544464, + "step": 7350 + }, + { + "epoch": 0.48, + "learning_rate": 3.0918688413739197e-06, + "logits/chosen": -2.3423869609832764, + "logits/rejected": -2.030740261077881, + "logps/chosen": -229.8058319091797, + "logps/rejected": -176.20933532714844, + "loss": 0.6871, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.028771137818694115, + "rewards/margins": 0.10676223039627075, + "rewards/rejected": -0.07799109816551208, + "step": 7360 + }, + { + "epoch": 0.48, + "learning_rate": 3.0863197180232178e-06, + "logits/chosen": -2.3902642726898193, + "logits/rejected": -2.0197548866271973, + "logps/chosen": -197.1629638671875, + "logps/rejected": -192.62374877929688, + "loss": 0.6899, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.025812137871980667, + "rewards/margins": 0.07681456953287125, + "rewards/rejected": -0.051002420485019684, + "step": 7370 + }, + { + "epoch": 0.48, + "learning_rate": 3.0807675361416554e-06, + "logits/chosen": -2.2906887531280518, + "logits/rejected": -2.047722339630127, + "logps/chosen": -186.08908081054688, + "logps/rejected": -114.9020004272461, + "loss": 0.6903, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.02791173756122589, + "rewards/margins": 0.0784730315208435, + "rewards/rejected": -0.050561290234327316, + "step": 7380 + }, + { + "epoch": 0.48, + "learning_rate": 3.0752123246921327e-06, + "logits/chosen": -2.4102749824523926, + "logits/rejected": -2.1352226734161377, + "logps/chosen": -277.206298828125, + "logps/rejected": -218.3984832763672, + "loss": 0.6903, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.007069566752761602, + "rewards/margins": 0.08123649656772614, + "rewards/rejected": -0.0741669163107872, + "step": 7390 + }, + { + "epoch": 0.48, + "learning_rate": 3.069654112653353e-06, + "logits/chosen": -2.4256751537323, + "logits/rejected": -2.2184150218963623, + "logps/chosen": -216.2940216064453, + "logps/rejected": -188.50967407226562, + "loss": 0.6931, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0019019130850210786, + "rewards/margins": 0.04767733812332153, + "rewards/rejected": -0.04577542468905449, + "step": 7400 + }, + { + "epoch": 0.48, + "eval_logits/chosen": -2.311318874359131, + "eval_logits/rejected": -2.123793601989746, + "eval_logps/chosen": -231.12094116210938, + "eval_logps/rejected": -218.42494201660156, + "eval_loss": 0.6899686455726624, + "eval_rewards/accuracies": 0.6284999847412109, + "eval_rewards/chosen": 0.008840080350637436, + "eval_rewards/margins": 0.07697049528360367, + "eval_rewards/rejected": -0.06813041865825653, + "eval_runtime": 713.3758, + "eval_samples_per_second": 2.804, + "eval_steps_per_second": 1.402, + "step": 7400 + }, + { + "epoch": 0.48, + "learning_rate": 3.064092929019673e-06, + "logits/chosen": -2.3009819984436035, + "logits/rejected": -2.3356680870056152, + "logps/chosen": -256.2542419433594, + "logps/rejected": -281.6364440917969, + "loss": 0.6916, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.012892307713627815, + "rewards/margins": 0.05507419630885124, + "rewards/rejected": -0.04218188300728798, + "step": 7410 + }, + { + "epoch": 0.49, + "learning_rate": 3.058528802800952e-06, + "logits/chosen": -2.342904567718506, + "logits/rejected": -2.102327823638916, + "logps/chosen": -290.8139953613281, + "logps/rejected": -261.8834228515625, + "loss": 0.691, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.01819724775850773, + "rewards/margins": 0.07642240822315216, + "rewards/rejected": -0.05822516605257988, + "step": 7420 + }, + { + "epoch": 0.49, + "learning_rate": 3.052961763022397e-06, + "logits/chosen": -2.481123685836792, + "logits/rejected": -2.149416208267212, + "logps/chosen": -183.3396453857422, + "logps/rejected": -155.44528198242188, + "loss": 0.6881, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.004584819078445435, + "rewards/margins": 0.10829279571771622, + "rewards/rejected": -0.11287760734558105, + "step": 7430 + }, + { + "epoch": 0.49, + "learning_rate": 3.047391838724415e-06, + "logits/chosen": -2.444658041000366, + "logits/rejected": -1.982791543006897, + "logps/chosen": -232.76083374023438, + "logps/rejected": -227.51760864257812, + "loss": 0.6888, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.02757873572409153, + "rewards/margins": 0.10823357105255127, + "rewards/rejected": -0.08065483719110489, + "step": 7440 + }, + { + "epoch": 0.49, + "learning_rate": 3.0418190589624587e-06, + "logits/chosen": -2.3566372394561768, + "logits/rejected": -2.09330153465271, + "logps/chosen": -178.53126525878906, + "logps/rejected": -192.35299682617188, + "loss": 0.6925, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.009091891348361969, + "rewards/margins": 0.05469350144267082, + "rewards/rejected": -0.06378540396690369, + "step": 7450 + }, + { + "epoch": 0.49, + "learning_rate": 3.0362434528068784e-06, + "logits/chosen": -2.3358893394470215, + "logits/rejected": -1.9141845703125, + "logps/chosen": -268.863037109375, + "logps/rejected": -194.3677978515625, + "loss": 0.688, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.004601252265274525, + "rewards/margins": 0.10998895019292831, + "rewards/rejected": -0.10538768768310547, + "step": 7460 + }, + { + "epoch": 0.49, + "learning_rate": 3.0306650493427657e-06, + "logits/chosen": -2.2316243648529053, + "logits/rejected": -2.127760410308838, + "logps/chosen": -230.8894500732422, + "logps/rejected": -230.5992889404297, + "loss": 0.6894, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.007297568954527378, + "rewards/margins": 0.08671748638153076, + "rewards/rejected": -0.07941991835832596, + "step": 7470 + }, + { + "epoch": 0.49, + "learning_rate": 3.0250838776698077e-06, + "logits/chosen": -2.096904754638672, + "logits/rejected": -2.1422505378723145, + "logps/chosen": -184.96865844726562, + "logps/rejected": -193.18240356445312, + "loss": 0.6862, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.010493558831512928, + "rewards/margins": 0.10960110276937485, + "rewards/rejected": -0.1200946569442749, + "step": 7480 + }, + { + "epoch": 0.49, + "learning_rate": 3.0194999669021275e-06, + "logits/chosen": -2.098390579223633, + "logits/rejected": -1.7727285623550415, + "logps/chosen": -226.0522003173828, + "logps/rejected": -189.67776489257812, + "loss": 0.6894, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.008162322454154491, + "rewards/margins": 0.10677523910999298, + "rewards/rejected": -0.09861291944980621, + "step": 7490 + }, + { + "epoch": 0.49, + "learning_rate": 3.0139133461681403e-06, + "logits/chosen": -2.243513822555542, + "logits/rejected": -2.0963521003723145, + "logps/chosen": -263.7023010253906, + "logps/rejected": -215.9556121826172, + "loss": 0.6895, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.006919173989444971, + "rewards/margins": 0.08393418788909912, + "rewards/rejected": -0.07701500505208969, + "step": 7500 + }, + { + "epoch": 0.49, + "eval_logits/chosen": -2.286125898361206, + "eval_logits/rejected": -2.1006903648376465, + "eval_logps/chosen": -231.99969482421875, + "eval_logps/rejected": -219.49575805664062, + "eval_loss": 0.6899300813674927, + "eval_rewards/accuracies": 0.628000020980835, + "eval_rewards/chosen": 5.265325307846069e-05, + "eval_rewards/margins": 0.07889124006032944, + "eval_rewards/rejected": -0.07883859425783157, + "eval_runtime": 711.1311, + "eval_samples_per_second": 2.812, + "eval_steps_per_second": 1.406, + "step": 7500 + }, + { + "epoch": 0.49, + "learning_rate": 3.0083240446103965e-06, + "logits/chosen": -2.0148041248321533, + "logits/rejected": -1.978687047958374, + "logps/chosen": -184.3916015625, + "logps/rejected": -200.95066833496094, + "loss": 0.6889, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.0035699151922017336, + "rewards/margins": 0.08889930695295334, + "rewards/rejected": -0.0924692153930664, + "step": 7510 + }, + { + "epoch": 0.49, + "learning_rate": 3.0027320913854306e-06, + "logits/chosen": -2.4875292778015137, + "logits/rejected": -2.200932025909424, + "logps/chosen": -291.66192626953125, + "logps/rejected": -237.73507690429688, + "loss": 0.6914, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.007996944710612297, + "rewards/margins": 0.08782526105642319, + "rewards/rejected": -0.07982831448316574, + "step": 7520 + }, + { + "epoch": 0.49, + "learning_rate": 2.997137515663609e-06, + "logits/chosen": -2.2359402179718018, + "logits/rejected": -2.1508307456970215, + "logps/chosen": -223.72048950195312, + "logps/rejected": -195.07565307617188, + "loss": 0.6891, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.01656939461827278, + "rewards/margins": 0.09054360538721085, + "rewards/rejected": -0.07397421449422836, + "step": 7530 + }, + { + "epoch": 0.49, + "learning_rate": 2.991540346628981e-06, + "logits/chosen": -2.342252492904663, + "logits/rejected": -2.15889573097229, + "logps/chosen": -238.47000122070312, + "logps/rejected": -218.312744140625, + "loss": 0.6902, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.011303565464913845, + "rewards/margins": 0.05950998514890671, + "rewards/rejected": -0.048206426203250885, + "step": 7540 + }, + { + "epoch": 0.49, + "learning_rate": 2.985940613479121e-06, + "logits/chosen": -2.4330556392669678, + "logits/rejected": -2.323356866836548, + "logps/chosen": -292.80767822265625, + "logps/rejected": -241.87033081054688, + "loss": 0.6897, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.010419869795441628, + "rewards/margins": 0.08434535562992096, + "rewards/rejected": -0.07392548024654388, + "step": 7550 + }, + { + "epoch": 0.49, + "learning_rate": 2.980338345424981e-06, + "logits/chosen": -2.2963860034942627, + "logits/rejected": -1.9866485595703125, + "logps/chosen": -248.8218536376953, + "logps/rejected": -206.16690063476562, + "loss": 0.691, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.003979469649493694, + "rewards/margins": 0.05621809884905815, + "rewards/rejected": -0.05223863199353218, + "step": 7560 + }, + { + "epoch": 0.5, + "learning_rate": 2.974733571690735e-06, + "logits/chosen": -2.3758111000061035, + "logits/rejected": -2.0978314876556396, + "logps/chosen": -238.9188232421875, + "logps/rejected": -186.61029052734375, + "loss": 0.6892, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.015120044350624084, + "rewards/margins": 0.0698397159576416, + "rewards/rejected": -0.08495976030826569, + "step": 7570 + }, + { + "epoch": 0.5, + "learning_rate": 2.9691263215136274e-06, + "logits/chosen": -2.339653730392456, + "logits/rejected": -2.3221707344055176, + "logps/chosen": -263.97918701171875, + "logps/rejected": -240.21945190429688, + "loss": 0.6909, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.03021194040775299, + "rewards/margins": 0.07894815504550934, + "rewards/rejected": -0.048736222088336945, + "step": 7580 + }, + { + "epoch": 0.5, + "learning_rate": 2.963516624143823e-06, + "logits/chosen": -2.230799436569214, + "logits/rejected": -2.1142804622650146, + "logps/chosen": -213.874755859375, + "logps/rejected": -185.1417694091797, + "loss": 0.689, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.013080480508506298, + "rewards/margins": 0.0879780501127243, + "rewards/rejected": -0.10105852037668228, + "step": 7590 + }, + { + "epoch": 0.5, + "learning_rate": 2.9579045088442504e-06, + "logits/chosen": -2.1219449043273926, + "logits/rejected": -2.1596245765686035, + "logps/chosen": -189.5128631591797, + "logps/rejected": -221.0270538330078, + "loss": 0.6874, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.010078911669552326, + "rewards/margins": 0.11765221506357193, + "rewards/rejected": -0.12773114442825317, + "step": 7600 + }, + { + "epoch": 0.5, + "eval_logits/chosen": -2.288795232772827, + "eval_logits/rejected": -2.1032533645629883, + "eval_logps/chosen": -232.4485321044922, + "eval_logps/rejected": -220.70327758789062, + "eval_loss": 0.6899698376655579, + "eval_rewards/accuracies": 0.6299999952316284, + "eval_rewards/chosen": -0.004435794893652201, + "eval_rewards/margins": 0.08647802472114563, + "eval_rewards/rejected": -0.0909138172864914, + "eval_runtime": 714.3681, + "eval_samples_per_second": 2.8, + "eval_steps_per_second": 1.4, + "step": 7600 + }, + { + "epoch": 0.5, + "learning_rate": 2.9522900048904534e-06, + "logits/chosen": -2.2064361572265625, + "logits/rejected": -2.1144938468933105, + "logps/chosen": -244.34390258789062, + "logps/rejected": -218.4773406982422, + "loss": 0.693, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.046057600528001785, + "rewards/margins": 0.041549500077962875, + "rewards/rejected": -0.08760710060596466, + "step": 7610 + }, + { + "epoch": 0.5, + "learning_rate": 2.9466731415704343e-06, + "logits/chosen": -2.2881698608398438, + "logits/rejected": -2.161687135696411, + "logps/chosen": -225.63803100585938, + "logps/rejected": -229.7403564453125, + "loss": 0.6919, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.018513670191168785, + "rewards/margins": 0.0846969336271286, + "rewards/rejected": -0.10321060568094254, + "step": 7620 + }, + { + "epoch": 0.5, + "learning_rate": 2.941053948184503e-06, + "logits/chosen": -2.339186668395996, + "logits/rejected": -2.1879472732543945, + "logps/chosen": -279.9281921386719, + "logps/rejected": -249.49502563476562, + "loss": 0.6916, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.00010864362411666662, + "rewards/margins": 0.04479020833969116, + "rewards/rejected": -0.04468156024813652, + "step": 7630 + }, + { + "epoch": 0.5, + "learning_rate": 2.935432454045125e-06, + "logits/chosen": -2.1335971355438232, + "logits/rejected": -2.1970443725585938, + "logps/chosen": -233.8820037841797, + "logps/rejected": -216.0299835205078, + "loss": 0.6916, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.019176747649908066, + "rewards/margins": 0.033554740250110626, + "rewards/rejected": -0.05273149162530899, + "step": 7640 + }, + { + "epoch": 0.5, + "learning_rate": 2.929808688476768e-06, + "logits/chosen": -2.363029956817627, + "logits/rejected": -2.2410759925842285, + "logps/chosen": -240.63131713867188, + "logps/rejected": -230.9453887939453, + "loss": 0.6898, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.012689086608588696, + "rewards/margins": 0.07916554063558578, + "rewards/rejected": -0.09185463190078735, + "step": 7650 + }, + { + "epoch": 0.5, + "learning_rate": 2.924182680815748e-06, + "logits/chosen": -2.2831203937530518, + "logits/rejected": -2.210198402404785, + "logps/chosen": -232.90261840820312, + "logps/rejected": -222.2987823486328, + "loss": 0.6881, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.017279820516705513, + "rewards/margins": 0.12103778123855591, + "rewards/rejected": -0.10375796258449554, + "step": 7660 + }, + { + "epoch": 0.5, + "learning_rate": 2.9185544604100765e-06, + "logits/chosen": -2.063129186630249, + "logits/rejected": -1.9675689935684204, + "logps/chosen": -198.81106567382812, + "logps/rejected": -202.04229736328125, + "loss": 0.6891, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.02900967001914978, + "rewards/margins": 0.07486601918935776, + "rewards/rejected": -0.10387568175792694, + "step": 7670 + }, + { + "epoch": 0.5, + "learning_rate": 2.9129240566193083e-06, + "logits/chosen": -2.3740992546081543, + "logits/rejected": -2.0523669719696045, + "logps/chosen": -202.94161987304688, + "logps/rejected": -199.41842651367188, + "loss": 0.6884, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.00861315242946148, + "rewards/margins": 0.09764768928289413, + "rewards/rejected": -0.10626085102558136, + "step": 7680 + }, + { + "epoch": 0.5, + "learning_rate": 2.9072914988143874e-06, + "logits/chosen": -2.1358678340911865, + "logits/rejected": -2.0475707054138184, + "logps/chosen": -201.20492553710938, + "logps/rejected": -204.98117065429688, + "loss": 0.6885, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.002248757751658559, + "rewards/margins": 0.12247662246227264, + "rewards/rejected": -0.1202278584241867, + "step": 7690 + }, + { + "epoch": 0.5, + "learning_rate": 2.9016568163774956e-06, + "logits/chosen": -2.360272169113159, + "logits/rejected": -2.1226887702941895, + "logps/chosen": -172.92312622070312, + "logps/rejected": -146.11019897460938, + "loss": 0.6898, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.006938323378562927, + "rewards/margins": 0.07239948213100433, + "rewards/rejected": -0.07933782041072845, + "step": 7700 + }, + { + "epoch": 0.5, + "eval_logits/chosen": -2.2827038764953613, + "eval_logits/rejected": -2.097726583480835, + "eval_logps/chosen": -231.82516479492188, + "eval_logps/rejected": -219.7780303955078, + "eval_loss": 0.6899218559265137, + "eval_rewards/accuracies": 0.6355000138282776, + "eval_rewards/chosen": 0.001797709846869111, + "eval_rewards/margins": 0.08345920592546463, + "eval_rewards/rejected": -0.08166150003671646, + "eval_runtime": 710.0146, + "eval_samples_per_second": 2.817, + "eval_steps_per_second": 1.408, + "step": 7700 + }, + { + "epoch": 0.5, + "learning_rate": 2.8960200387018942e-06, + "logits/chosen": -2.1221325397491455, + "logits/rejected": -2.0857224464416504, + "logps/chosen": -308.96600341796875, + "logps/rejected": -268.85888671875, + "loss": 0.6925, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.012330361641943455, + "rewards/margins": 0.08727259188890457, + "rewards/rejected": -0.09960294514894485, + "step": 7710 + }, + { + "epoch": 0.51, + "learning_rate": 2.8903811951917792e-06, + "logits/chosen": -2.2766757011413574, + "logits/rejected": -2.124586582183838, + "logps/chosen": -199.05517578125, + "logps/rejected": -159.13063049316406, + "loss": 0.6899, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.007754988968372345, + "rewards/margins": 0.07217663526535034, + "rewards/rejected": -0.07993160933256149, + "step": 7720 + }, + { + "epoch": 0.51, + "learning_rate": 2.88474031526212e-06, + "logits/chosen": -2.2419610023498535, + "logits/rejected": -2.2114017009735107, + "logps/chosen": -203.2736053466797, + "logps/rejected": -222.61083984375, + "loss": 0.6909, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.004338492639362812, + "rewards/margins": 0.05908681079745293, + "rewards/rejected": -0.06342529505491257, + "step": 7730 + }, + { + "epoch": 0.51, + "learning_rate": 2.879097428338509e-06, + "logits/chosen": -2.2317774295806885, + "logits/rejected": -1.9235107898712158, + "logps/chosen": -217.020751953125, + "logps/rejected": -202.31373596191406, + "loss": 0.6895, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.011257743462920189, + "rewards/margins": 0.0762132927775383, + "rewards/rejected": -0.08747103810310364, + "step": 7740 + }, + { + "epoch": 0.51, + "learning_rate": 2.8734525638570094e-06, + "logits/chosen": -2.234351634979248, + "logits/rejected": -2.1596150398254395, + "logps/chosen": -232.974853515625, + "logps/rejected": -226.3662567138672, + "loss": 0.6933, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.012193715199828148, + "rewards/margins": 0.05764711648225784, + "rewards/rejected": -0.04545340687036514, + "step": 7750 + }, + { + "epoch": 0.51, + "learning_rate": 2.8678057512639982e-06, + "logits/chosen": -2.181051254272461, + "logits/rejected": -2.088076114654541, + "logps/chosen": -284.4569091796875, + "logps/rejected": -273.23193359375, + "loss": 0.6897, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.016095653176307678, + "rewards/margins": 0.10834445059299469, + "rewards/rejected": -0.09224879741668701, + "step": 7760 + }, + { + "epoch": 0.51, + "learning_rate": 2.8621570200160172e-06, + "logits/chosen": -2.0607194900512695, + "logits/rejected": -1.9694864749908447, + "logps/chosen": -167.73159790039062, + "logps/rejected": -169.00257873535156, + "loss": 0.6897, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.008374934084713459, + "rewards/margins": 0.09448956698179245, + "rewards/rejected": -0.08611463010311127, + "step": 7770 + }, + { + "epoch": 0.51, + "learning_rate": 2.856506399579615e-06, + "logits/chosen": -2.414057493209839, + "logits/rejected": -2.0493381023406982, + "logps/chosen": -222.3790740966797, + "logps/rejected": -209.5500946044922, + "loss": 0.689, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.04015577957034111, + "rewards/margins": 0.06660310924053192, + "rewards/rejected": -0.10675889253616333, + "step": 7780 + }, + { + "epoch": 0.51, + "learning_rate": 2.8508539194311964e-06, + "logits/chosen": -2.3235208988189697, + "logits/rejected": -2.316335916519165, + "logps/chosen": -255.4383544921875, + "logps/rejected": -273.5601806640625, + "loss": 0.6911, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.017049241811037064, + "rewards/margins": 0.052917949855327606, + "rewards/rejected": -0.06996718794107437, + "step": 7790 + }, + { + "epoch": 0.51, + "learning_rate": 2.8451996090568656e-06, + "logits/chosen": -2.2277419567108154, + "logits/rejected": -2.1044132709503174, + "logps/chosen": -189.9392547607422, + "logps/rejected": -181.34808349609375, + "loss": 0.6885, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.03799670934677124, + "rewards/margins": 0.09370444715023041, + "rewards/rejected": -0.13170115649700165, + "step": 7800 + }, + { + "epoch": 0.51, + "eval_logits/chosen": -2.2713305950164795, + "eval_logits/rejected": -2.086493968963623, + "eval_logps/chosen": -235.31703186035156, + "eval_logps/rejected": -223.4753875732422, + "eval_loss": 0.6899875402450562, + "eval_rewards/accuracies": 0.6485000252723694, + "eval_rewards/chosen": -0.03312075883150101, + "eval_rewards/margins": 0.08551418036222458, + "eval_rewards/rejected": -0.11863493919372559, + "eval_runtime": 707.6852, + "eval_samples_per_second": 2.826, + "eval_steps_per_second": 1.413, + "step": 7800 + }, + { + "epoch": 0.51, + "learning_rate": 2.839543497952276e-06, + "logits/chosen": -2.1599411964416504, + "logits/rejected": -2.2668697834014893, + "logps/chosen": -189.414306640625, + "logps/rejected": -190.27862548828125, + "loss": 0.6908, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.04804200679063797, + "rewards/margins": 0.09568199515342712, + "rewards/rejected": -0.1437240093946457, + "step": 7810 + }, + { + "epoch": 0.51, + "learning_rate": 2.833885615622474e-06, + "logits/chosen": -2.2124152183532715, + "logits/rejected": -2.0647387504577637, + "logps/chosen": -208.2938995361328, + "logps/rejected": -225.68045043945312, + "loss": 0.6928, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.05718092992901802, + "rewards/margins": 0.07911114394664764, + "rewards/rejected": -0.13629207015037537, + "step": 7820 + }, + { + "epoch": 0.51, + "learning_rate": 2.8282259915817454e-06, + "logits/chosen": -1.902604341506958, + "logits/rejected": -2.096595287322998, + "logps/chosen": -144.83163452148438, + "logps/rejected": -194.21328735351562, + "loss": 0.6882, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.038869358599185944, + "rewards/margins": 0.09170033037662506, + "rewards/rejected": -0.1305696964263916, + "step": 7830 + }, + { + "epoch": 0.51, + "learning_rate": 2.8225646553534614e-06, + "logits/chosen": -2.0661423206329346, + "logits/rejected": -1.9575055837631226, + "logps/chosen": -201.17019653320312, + "logps/rejected": -204.97335815429688, + "loss": 0.6915, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.01854178123176098, + "rewards/margins": 0.07094518840312958, + "rewards/rejected": -0.0894869714975357, + "step": 7840 + }, + { + "epoch": 0.51, + "learning_rate": 2.8169016364699255e-06, + "logits/chosen": -2.2641472816467285, + "logits/rejected": -1.9965393543243408, + "logps/chosen": -217.72933959960938, + "logps/rejected": -225.2642822265625, + "loss": 0.6926, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.05841469764709473, + "rewards/margins": 0.057847362011671066, + "rewards/rejected": -0.1162620559334755, + "step": 7850 + }, + { + "epoch": 0.51, + "learning_rate": 2.811236964472217e-06, + "logits/chosen": -2.3709425926208496, + "logits/rejected": -2.0033254623413086, + "logps/chosen": -314.4047546386719, + "logps/rejected": -261.5574035644531, + "loss": 0.6898, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.03998160362243652, + "rewards/margins": 0.07272603362798691, + "rewards/rejected": -0.11270763725042343, + "step": 7860 + }, + { + "epoch": 0.51, + "learning_rate": 2.805570668910041e-06, + "logits/chosen": -2.0802268981933594, + "logits/rejected": -2.0542476177215576, + "logps/chosen": -177.2976837158203, + "logps/rejected": -247.8351593017578, + "loss": 0.6897, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.07658363878726959, + "rewards/margins": 0.07037156820297241, + "rewards/rejected": -0.146955206990242, + "step": 7870 + }, + { + "epoch": 0.52, + "learning_rate": 2.7999027793415695e-06, + "logits/chosen": -2.464724063873291, + "logits/rejected": -1.99410879611969, + "logps/chosen": -250.383544921875, + "logps/rejected": -210.59585571289062, + "loss": 0.6918, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.054675567895174026, + "rewards/margins": 0.051379382610321045, + "rewards/rejected": -0.10605494678020477, + "step": 7880 + }, + { + "epoch": 0.52, + "learning_rate": 2.794233325333293e-06, + "logits/chosen": -2.1549906730651855, + "logits/rejected": -2.0477986335754395, + "logps/chosen": -261.9752502441406, + "logps/rejected": -247.65414428710938, + "loss": 0.6893, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.03693775087594986, + "rewards/margins": 0.09356243908405304, + "rewards/rejected": -0.1305001974105835, + "step": 7890 + }, + { + "epoch": 0.52, + "learning_rate": 2.7885623364598597e-06, + "logits/chosen": -2.3811306953430176, + "logits/rejected": -2.0712475776672363, + "logps/chosen": -270.0716247558594, + "logps/rejected": -237.21182250976562, + "loss": 0.6905, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.06379027664661407, + "rewards/margins": 0.09059080481529236, + "rewards/rejected": -0.15438108146190643, + "step": 7900 + }, + { + "epoch": 0.52, + "eval_logits/chosen": -2.269857883453369, + "eval_logits/rejected": -2.085240125656128, + "eval_logps/chosen": -236.7635498046875, + "eval_logps/rejected": -224.1826629638672, + "eval_loss": 0.6899347901344299, + "eval_rewards/accuracies": 0.6424999833106995, + "eval_rewards/chosen": -0.047585878521203995, + "eval_rewards/margins": 0.07812169939279556, + "eval_rewards/rejected": -0.12570756673812866, + "eval_runtime": 710.7257, + "eval_samples_per_second": 2.814, + "eval_steps_per_second": 1.407, + "step": 7900 + }, + { + "epoch": 0.52, + "learning_rate": 2.782889842303926e-06, + "logits/chosen": -2.2479918003082275, + "logits/rejected": -2.0780441761016846, + "logps/chosen": -169.74075317382812, + "logps/rejected": -166.49923706054688, + "loss": 0.6938, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.09511653333902359, + "rewards/margins": 0.022563491016626358, + "rewards/rejected": -0.11768001317977905, + "step": 7910 + }, + { + "epoch": 0.52, + "learning_rate": 2.7772158724559987e-06, + "logits/chosen": -2.092353105545044, + "logits/rejected": -1.937853217124939, + "logps/chosen": -221.2228546142578, + "logps/rejected": -273.72015380859375, + "loss": 0.6844, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.041138116270303726, + "rewards/margins": 0.15026527643203735, + "rewards/rejected": -0.1914033740758896, + "step": 7920 + }, + { + "epoch": 0.52, + "learning_rate": 2.7715404565142856e-06, + "logits/chosen": -2.250084161758423, + "logits/rejected": -2.1173858642578125, + "logps/chosen": -204.4228515625, + "logps/rejected": -204.76995849609375, + "loss": 0.6908, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0445866696536541, + "rewards/margins": 0.06569734215736389, + "rewards/rejected": -0.11028401553630829, + "step": 7930 + }, + { + "epoch": 0.52, + "learning_rate": 2.7658636240845354e-06, + "logits/chosen": -2.4091989994049072, + "logits/rejected": -2.306553363800049, + "logps/chosen": -226.8249969482422, + "logps/rejected": -250.21707153320312, + "loss": 0.6904, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.03376641497015953, + "rewards/margins": 0.08774002641439438, + "rewards/rejected": -0.12150643765926361, + "step": 7940 + }, + { + "epoch": 0.52, + "learning_rate": 2.7601854047798872e-06, + "logits/chosen": -2.1954774856567383, + "logits/rejected": -2.236949920654297, + "logps/chosen": -228.4868621826172, + "logps/rejected": -254.75991821289062, + "loss": 0.6892, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.045634832233190536, + "rewards/margins": 0.06857772916555405, + "rewards/rejected": -0.11421255767345428, + "step": 7950 + }, + { + "epoch": 0.52, + "learning_rate": 2.7545058282207148e-06, + "logits/chosen": -2.3288655281066895, + "logits/rejected": -1.9428226947784424, + "logps/chosen": -215.27224731445312, + "logps/rejected": -191.93368530273438, + "loss": 0.6902, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.03730421140789986, + "rewards/margins": 0.07240404933691025, + "rewards/rejected": -0.1097082644701004, + "step": 7960 + }, + { + "epoch": 0.52, + "learning_rate": 2.748824924034471e-06, + "logits/chosen": -2.2552783489227295, + "logits/rejected": -2.120013475418091, + "logps/chosen": -226.8047637939453, + "logps/rejected": -217.0663299560547, + "loss": 0.6891, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.07394564896821976, + "rewards/margins": 0.09158217161893845, + "rewards/rejected": -0.1655278354883194, + "step": 7970 + }, + { + "epoch": 0.52, + "learning_rate": 2.743142721855536e-06, + "logits/chosen": -2.1140682697296143, + "logits/rejected": -2.0926127433776855, + "logps/chosen": -157.36866760253906, + "logps/rejected": -154.32342529296875, + "loss": 0.6903, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.05508549138903618, + "rewards/margins": 0.053032286465168, + "rewards/rejected": -0.10811777412891388, + "step": 7980 + }, + { + "epoch": 0.52, + "learning_rate": 2.737459251325058e-06, + "logits/chosen": -2.23268985748291, + "logits/rejected": -2.1779227256774902, + "logps/chosen": -271.97705078125, + "logps/rejected": -255.03189086914062, + "loss": 0.6915, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.020035061985254288, + "rewards/margins": 0.055861860513687134, + "rewards/rejected": -0.07589691877365112, + "step": 7990 + }, + { + "epoch": 0.52, + "learning_rate": 2.731774542090804e-06, + "logits/chosen": -2.1980490684509277, + "logits/rejected": -1.7955074310302734, + "logps/chosen": -196.6571044921875, + "logps/rejected": -182.37567138671875, + "loss": 0.6911, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.03881066292524338, + "rewards/margins": 0.04753577709197998, + "rewards/rejected": -0.08634644001722336, + "step": 8000 + }, + { + "epoch": 0.52, + "eval_logits/chosen": -2.265841245651245, + "eval_logits/rejected": -2.081429958343506, + "eval_logps/chosen": -235.2987060546875, + "eval_logps/rejected": -223.01144409179688, + "eval_loss": 0.6899079084396362, + "eval_rewards/accuracies": 0.6345000267028809, + "eval_rewards/chosen": -0.03293740004301071, + "eval_rewards/margins": 0.08105786144733429, + "eval_rewards/rejected": -0.113995261490345, + "eval_runtime": 710.8315, + "eval_samples_per_second": 2.814, + "eval_steps_per_second": 1.407, + "step": 8000 + }, + { + "epoch": 0.52, + "learning_rate": 2.7260886238070034e-06, + "logits/chosen": -2.271594524383545, + "logits/rejected": -2.1849944591522217, + "logps/chosen": -198.82374572753906, + "logps/rejected": -202.0972442626953, + "loss": 0.6911, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.023537836968898773, + "rewards/margins": 0.07981632649898529, + "rewards/rejected": -0.10335417091846466, + "step": 8010 + }, + { + "epoch": 0.52, + "learning_rate": 2.72040152613419e-06, + "logits/chosen": -2.2961008548736572, + "logits/rejected": -1.8004083633422852, + "logps/chosen": -219.9088897705078, + "logps/rejected": -149.8590087890625, + "loss": 0.6852, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.03033808246254921, + "rewards/margins": 0.13175645470619202, + "rewards/rejected": -0.16209453344345093, + "step": 8020 + }, + { + "epoch": 0.53, + "learning_rate": 2.7147132787390516e-06, + "logits/chosen": -2.286135196685791, + "logits/rejected": -1.993798851966858, + "logps/chosen": -229.1460418701172, + "logps/rejected": -220.80264282226562, + "loss": 0.6912, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.01667933538556099, + "rewards/margins": 0.08642159402370453, + "rewards/rejected": -0.10310093313455582, + "step": 8030 + }, + { + "epoch": 0.53, + "learning_rate": 2.709023911294273e-06, + "logits/chosen": -2.374183177947998, + "logits/rejected": -1.9083032608032227, + "logps/chosen": -242.0371551513672, + "logps/rejected": -225.60302734375, + "loss": 0.6868, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.002816893858835101, + "rewards/margins": 0.14465411007404327, + "rewards/rejected": -0.1418372094631195, + "step": 8040 + }, + { + "epoch": 0.53, + "learning_rate": 2.7033334534783806e-06, + "logits/chosen": -2.263575792312622, + "logits/rejected": -2.3582472801208496, + "logps/chosen": -200.14503479003906, + "logps/rejected": -225.3608856201172, + "loss": 0.6891, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.009991501457989216, + "rewards/margins": 0.09016770124435425, + "rewards/rejected": -0.10015920549631119, + "step": 8050 + }, + { + "epoch": 0.53, + "learning_rate": 2.697641934975592e-06, + "logits/chosen": -2.2752552032470703, + "logits/rejected": -2.050177574157715, + "logps/chosen": -229.1072998046875, + "logps/rejected": -205.2366180419922, + "loss": 0.6883, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.013208845630288124, + "rewards/margins": 0.08683110773563385, + "rewards/rejected": -0.10003993660211563, + "step": 8060 + }, + { + "epoch": 0.53, + "learning_rate": 2.691949385475654e-06, + "logits/chosen": -2.3117451667785645, + "logits/rejected": -2.063112258911133, + "logps/chosen": -246.4084930419922, + "logps/rejected": -229.3635711669922, + "loss": 0.6892, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.02517673373222351, + "rewards/margins": 0.0852632075548172, + "rewards/rejected": -0.11043993383646011, + "step": 8070 + }, + { + "epoch": 0.53, + "learning_rate": 2.6862558346736937e-06, + "logits/chosen": -2.239243268966675, + "logits/rejected": -2.0910251140594482, + "logps/chosen": -241.4723663330078, + "logps/rejected": -251.2477264404297, + "loss": 0.6856, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.031201153993606567, + "rewards/margins": 0.14977136254310608, + "rewards/rejected": -0.18097251653671265, + "step": 8080 + }, + { + "epoch": 0.53, + "learning_rate": 2.6805613122700617e-06, + "logits/chosen": -2.282254457473755, + "logits/rejected": -1.951345682144165, + "logps/chosen": -227.823486328125, + "logps/rejected": -238.18466186523438, + "loss": 0.688, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.04250997677445412, + "rewards/margins": 0.09419043362140656, + "rewards/rejected": -0.13670040667057037, + "step": 8090 + }, + { + "epoch": 0.53, + "learning_rate": 2.674865847970176e-06, + "logits/chosen": -2.219407320022583, + "logits/rejected": -1.950874924659729, + "logps/chosen": -209.02536010742188, + "logps/rejected": -239.494384765625, + "loss": 0.6915, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.014089837670326233, + "rewards/margins": 0.08020684868097305, + "rewards/rejected": -0.09429670870304108, + "step": 8100 + }, + { + "epoch": 0.53, + "eval_logits/chosen": -2.2729127407073975, + "eval_logits/rejected": -2.08774733543396, + "eval_logps/chosen": -233.5811309814453, + "eval_logps/rejected": -221.25350952148438, + "eval_loss": 0.6898881793022156, + "eval_rewards/accuracies": 0.6365000009536743, + "eval_rewards/chosen": -0.01576184667646885, + "eval_rewards/margins": 0.08065415918827057, + "eval_rewards/rejected": -0.09641600400209427, + "eval_runtime": 710.7109, + "eval_samples_per_second": 2.814, + "eval_steps_per_second": 1.407, + "step": 8100 + }, + { + "epoch": 0.53, + "learning_rate": 2.669169471484368e-06, + "logits/chosen": -2.0301496982574463, + "logits/rejected": -2.0818302631378174, + "logps/chosen": -168.10691833496094, + "logps/rejected": -169.706298828125, + "loss": 0.6909, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.03048858605325222, + "rewards/margins": 0.04239571467041969, + "rewards/rejected": -0.07288429886102676, + "step": 8110 + }, + { + "epoch": 0.53, + "learning_rate": 2.6634722125277278e-06, + "logits/chosen": -2.373579502105713, + "logits/rejected": -2.051607370376587, + "logps/chosen": -235.17337036132812, + "logps/rejected": -251.06521606445312, + "loss": 0.6887, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.030534693971276283, + "rewards/margins": 0.076970174908638, + "rewards/rejected": -0.10750486701726913, + "step": 8120 + }, + { + "epoch": 0.53, + "learning_rate": 2.6577741008199498e-06, + "logits/chosen": -2.2919061183929443, + "logits/rejected": -1.9393638372421265, + "logps/chosen": -256.58197021484375, + "logps/rejected": -211.50302124023438, + "loss": 0.6866, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0042382776737213135, + "rewards/margins": 0.1496235430240631, + "rewards/rejected": -0.15386183559894562, + "step": 8130 + }, + { + "epoch": 0.53, + "learning_rate": 2.652075166085175e-06, + "logits/chosen": -2.1796364784240723, + "logits/rejected": -2.135631561279297, + "logps/chosen": -235.9457244873047, + "logps/rejected": -273.9045715332031, + "loss": 0.6874, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.013664024882018566, + "rewards/margins": 0.14413678646087646, + "rewards/rejected": -0.1578008234500885, + "step": 8140 + }, + { + "epoch": 0.53, + "learning_rate": 2.6463754380518395e-06, + "logits/chosen": -2.159858465194702, + "logits/rejected": -1.9525247812271118, + "logps/chosen": -236.5515899658203, + "logps/rejected": -190.86343383789062, + "loss": 0.6916, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.05593949556350708, + "rewards/margins": 0.09138914197683334, + "rewards/rejected": -0.14732863008975983, + "step": 8150 + }, + { + "epoch": 0.53, + "learning_rate": 2.6406749464525167e-06, + "logits/chosen": -2.2781941890716553, + "logits/rejected": -1.997957468032837, + "logps/chosen": -233.92446899414062, + "logps/rejected": -196.24688720703125, + "loss": 0.6901, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0036535251419991255, + "rewards/margins": 0.09554257988929749, + "rewards/rejected": -0.09919609874486923, + "step": 8160 + }, + { + "epoch": 0.53, + "learning_rate": 2.634973721023762e-06, + "logits/chosen": -2.3184168338775635, + "logits/rejected": -2.161243200302124, + "logps/chosen": -258.32891845703125, + "logps/rejected": -215.8189239501953, + "loss": 0.6902, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.03866691142320633, + "rewards/margins": 0.06928583234548569, + "rewards/rejected": -0.10795273631811142, + "step": 8170 + }, + { + "epoch": 0.54, + "learning_rate": 2.6292717915059605e-06, + "logits/chosen": -2.3634932041168213, + "logits/rejected": -2.096513032913208, + "logps/chosen": -278.8802490234375, + "logps/rejected": -233.5529022216797, + "loss": 0.6893, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.03250148147344589, + "rewards/margins": 0.10865737497806549, + "rewards/rejected": -0.1411588490009308, + "step": 8180 + }, + { + "epoch": 0.54, + "learning_rate": 2.6235691876431706e-06, + "logits/chosen": -2.171806812286377, + "logits/rejected": -2.2218527793884277, + "logps/chosen": -221.06301879882812, + "logps/rejected": -234.08663940429688, + "loss": 0.6891, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.02447427250444889, + "rewards/margins": 0.07424478232860565, + "rewards/rejected": -0.09871906042098999, + "step": 8190 + }, + { + "epoch": 0.54, + "learning_rate": 2.6178659391829673e-06, + "logits/chosen": -2.3931174278259277, + "logits/rejected": -2.1151492595672607, + "logps/chosen": -236.8017120361328, + "logps/rejected": -204.3024139404297, + "loss": 0.6907, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.006762659642845392, + "rewards/margins": 0.08343976736068726, + "rewards/rejected": -0.0766771137714386, + "step": 8200 + }, + { + "epoch": 0.54, + "eval_logits/chosen": -2.2691376209259033, + "eval_logits/rejected": -2.084272861480713, + "eval_logps/chosen": -234.5026397705078, + "eval_logps/rejected": -222.2466278076172, + "eval_loss": 0.6898766160011292, + "eval_rewards/accuracies": 0.6355000138282776, + "eval_rewards/chosen": -0.02497694082558155, + "eval_rewards/margins": 0.08137031644582748, + "eval_rewards/rejected": -0.10634726285934448, + "eval_runtime": 711.535, + "eval_samples_per_second": 2.811, + "eval_steps_per_second": 1.405, + "step": 8200 + }, + { + "epoch": 0.54, + "learning_rate": 2.6121620758762877e-06, + "logits/chosen": -2.2570462226867676, + "logits/rejected": -2.002037525177002, + "logps/chosen": -194.84579467773438, + "logps/rejected": -200.37400817871094, + "loss": 0.6911, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.027614828199148178, + "rewards/margins": 0.06327278167009354, + "rewards/rejected": -0.09088762104511261, + "step": 8210 + }, + { + "epoch": 0.54, + "learning_rate": 2.606457627477277e-06, + "logits/chosen": -2.1911864280700684, + "logits/rejected": -2.134552478790283, + "logps/chosen": -176.81307983398438, + "logps/rejected": -189.58029174804688, + "loss": 0.6916, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.011837871745228767, + "rewards/margins": 0.09415124356746674, + "rewards/rejected": -0.10598911345005035, + "step": 8220 + }, + { + "epoch": 0.54, + "learning_rate": 2.6007526237431324e-06, + "logits/chosen": -2.330580234527588, + "logits/rejected": -2.279081344604492, + "logps/chosen": -182.9707489013672, + "logps/rejected": -206.08935546875, + "loss": 0.6893, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.009412150830030441, + "rewards/margins": 0.0900057703256607, + "rewards/rejected": -0.09941791743040085, + "step": 8230 + }, + { + "epoch": 0.54, + "learning_rate": 2.5950470944339478e-06, + "logits/chosen": -2.110105514526367, + "logits/rejected": -2.1718857288360596, + "logps/chosen": -220.61978149414062, + "logps/rejected": -220.62734985351562, + "loss": 0.6922, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.01182152982801199, + "rewards/margins": 0.03503318130970001, + "rewards/rejected": -0.023211652413010597, + "step": 8240 + }, + { + "epoch": 0.54, + "learning_rate": 2.58934106931256e-06, + "logits/chosen": -2.2483153343200684, + "logits/rejected": -1.9538730382919312, + "logps/chosen": -221.7985382080078, + "logps/rejected": -208.62557983398438, + "loss": 0.6917, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.010264934040606022, + "rewards/margins": 0.06004582718014717, + "rewards/rejected": -0.07031075656414032, + "step": 8250 + }, + { + "epoch": 0.54, + "learning_rate": 2.58363457814439e-06, + "logits/chosen": -2.243074417114258, + "logits/rejected": -1.955980896949768, + "logps/chosen": -214.344970703125, + "logps/rejected": -209.1015625, + "loss": 0.6878, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.03673034533858299, + "rewards/margins": 0.0848006159067154, + "rewards/rejected": -0.12153096497058868, + "step": 8260 + }, + { + "epoch": 0.54, + "learning_rate": 2.5779276506972924e-06, + "logits/chosen": -2.2136752605438232, + "logits/rejected": -2.1928346157073975, + "logps/chosen": -233.85415649414062, + "logps/rejected": -200.51458740234375, + "loss": 0.6912, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.019013002514839172, + "rewards/margins": 0.0604260191321373, + "rewards/rejected": -0.07943902909755707, + "step": 8270 + }, + { + "epoch": 0.54, + "learning_rate": 2.5722203167413945e-06, + "logits/chosen": -2.336066961288452, + "logits/rejected": -2.0090115070343018, + "logps/chosen": -284.7717590332031, + "logps/rejected": -210.93600463867188, + "loss": 0.6896, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.014740842394530773, + "rewards/margins": 0.08085087686777115, + "rewards/rejected": -0.0955917239189148, + "step": 8280 + }, + { + "epoch": 0.54, + "learning_rate": 2.5665126060489476e-06, + "logits/chosen": -2.30047607421875, + "logits/rejected": -2.150911569595337, + "logps/chosen": -190.69834899902344, + "logps/rejected": -225.66976928710938, + "loss": 0.6904, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.02596314251422882, + "rewards/margins": 0.06124822050333023, + "rewards/rejected": -0.08721135556697845, + "step": 8290 + }, + { + "epoch": 0.54, + "learning_rate": 2.560804548394165e-06, + "logits/chosen": -2.222855567932129, + "logits/rejected": -1.9643363952636719, + "logps/chosen": -249.7584228515625, + "logps/rejected": -214.04666137695312, + "loss": 0.6893, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.02478059008717537, + "rewards/margins": 0.0956047847867012, + "rewards/rejected": -0.12038537114858627, + "step": 8300 + }, + { + "epoch": 0.54, + "eval_logits/chosen": -2.2778053283691406, + "eval_logits/rejected": -2.0923290252685547, + "eval_logps/chosen": -232.2015380859375, + "eval_logps/rejected": -219.4079132080078, + "eval_loss": 0.6899590492248535, + "eval_rewards/accuracies": 0.6345000267028809, + "eval_rewards/chosen": -0.0019656550139188766, + "eval_rewards/margins": 0.07599426060914993, + "eval_rewards/rejected": -0.07795991748571396, + "eval_runtime": 710.1344, + "eval_samples_per_second": 2.816, + "eval_steps_per_second": 1.408, + "step": 8300 + }, + { + "epoch": 0.54, + "learning_rate": 2.5550961735530734e-06, + "logits/chosen": -2.106081008911133, + "logits/rejected": -2.282960891723633, + "logps/chosen": -161.3715362548828, + "logps/rejected": -198.7847900390625, + "loss": 0.6906, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.016642851755023003, + "rewards/margins": 0.06822551786899567, + "rewards/rejected": -0.051582664251327515, + "step": 8310 + }, + { + "epoch": 0.54, + "learning_rate": 2.549387511303351e-06, + "logits/chosen": -2.265373706817627, + "logits/rejected": -2.3061885833740234, + "logps/chosen": -168.9114990234375, + "logps/rejected": -219.42587280273438, + "loss": 0.6896, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.008253363892436028, + "rewards/margins": 0.062337376177310944, + "rewards/rejected": -0.05408401042222977, + "step": 8320 + }, + { + "epoch": 0.55, + "learning_rate": 2.5436785914241774e-06, + "logits/chosen": -2.2159152030944824, + "logits/rejected": -2.2393479347229004, + "logps/chosen": -200.00836181640625, + "logps/rejected": -181.49374389648438, + "loss": 0.6869, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.01049137394875288, + "rewards/margins": 0.1345503181219101, + "rewards/rejected": -0.14504170417785645, + "step": 8330 + }, + { + "epoch": 0.55, + "learning_rate": 2.5379694436960746e-06, + "logits/chosen": -2.3889286518096924, + "logits/rejected": -2.1922972202301025, + "logps/chosen": -243.3367919921875, + "logps/rejected": -261.3111877441406, + "loss": 0.6911, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.026742050424218178, + "rewards/margins": 0.07409554719924927, + "rewards/rejected": -0.047353483736515045, + "step": 8340 + }, + { + "epoch": 0.55, + "learning_rate": 2.5322600979007533e-06, + "logits/chosen": -2.403104305267334, + "logits/rejected": -2.162173271179199, + "logps/chosen": -212.3261260986328, + "logps/rejected": -199.4026641845703, + "loss": 0.6898, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.0016930897254496813, + "rewards/margins": 0.07775741815567017, + "rewards/rejected": -0.0760643258690834, + "step": 8350 + }, + { + "epoch": 0.55, + "learning_rate": 2.5265505838209592e-06, + "logits/chosen": -2.4180874824523926, + "logits/rejected": -2.0715489387512207, + "logps/chosen": -256.7772521972656, + "logps/rejected": -215.9575958251953, + "loss": 0.6926, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.0018636692548170686, + "rewards/margins": 0.055936507880687714, + "rewards/rejected": -0.05407283455133438, + "step": 8360 + }, + { + "epoch": 0.55, + "learning_rate": 2.520840931240314e-06, + "logits/chosen": -2.448770046234131, + "logits/rejected": -1.9609102010726929, + "logps/chosen": -208.07290649414062, + "logps/rejected": -152.3336639404297, + "loss": 0.6915, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.017356865108013153, + "rewards/margins": 0.07981632649898529, + "rewards/rejected": -0.06245948001742363, + "step": 8370 + }, + { + "epoch": 0.55, + "learning_rate": 2.515131169943162e-06, + "logits/chosen": -1.9940284490585327, + "logits/rejected": -2.0735738277435303, + "logps/chosen": -258.408203125, + "logps/rejected": -259.9877624511719, + "loss": 0.6911, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0031367135234177113, + "rewards/margins": 0.08551900088787079, + "rewards/rejected": -0.08865571022033691, + "step": 8380 + }, + { + "epoch": 0.55, + "learning_rate": 2.509421329714416e-06, + "logits/chosen": -2.1275012493133545, + "logits/rejected": -2.1602554321289062, + "logps/chosen": -206.140869140625, + "logps/rejected": -231.0389862060547, + "loss": 0.6922, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.6808509826660156e-05, + "rewards/margins": 0.03706061840057373, + "rewards/rejected": -0.03707743063569069, + "step": 8390 + }, + { + "epoch": 0.55, + "learning_rate": 2.5037114403393987e-06, + "logits/chosen": -2.247596263885498, + "logits/rejected": -1.9996188879013062, + "logps/chosen": -209.26681518554688, + "logps/rejected": -182.84060668945312, + "loss": 0.6904, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.017135795205831528, + "rewards/margins": 0.07354326546192169, + "rewards/rejected": -0.05640747398138046, + "step": 8400 + }, + { + "epoch": 0.55, + "eval_logits/chosen": -2.2804696559906006, + "eval_logits/rejected": -2.095289468765259, + "eval_logps/chosen": -230.7716522216797, + "eval_logps/rejected": -217.13856506347656, + "eval_loss": 0.6900351643562317, + "eval_rewards/accuracies": 0.6294999718666077, + "eval_rewards/chosen": 0.012332833372056484, + "eval_rewards/margins": 0.0675993338227272, + "eval_rewards/rejected": -0.055266499519348145, + "eval_runtime": 713.3682, + "eval_samples_per_second": 2.804, + "eval_steps_per_second": 1.402, + "step": 8400 + }, + { + "epoch": 0.55, + "learning_rate": 2.4980015316036908e-06, + "logits/chosen": -2.116654872894287, + "logits/rejected": -2.1679673194885254, + "logps/chosen": -173.55227661132812, + "logps/rejected": -206.0382080078125, + "loss": 0.6871, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.022574327886104584, + "rewards/margins": 0.10900095850229263, + "rewards/rejected": -0.08642663061618805, + "step": 8410 + }, + { + "epoch": 0.55, + "learning_rate": 2.4922916332929725e-06, + "logits/chosen": -2.4510018825531006, + "logits/rejected": -2.1898789405822754, + "logps/chosen": -234.3470916748047, + "logps/rejected": -197.39511108398438, + "loss": 0.6919, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.010771475732326508, + "rewards/margins": 0.02662494219839573, + "rewards/rejected": -0.015853462740778923, + "step": 8420 + }, + { + "epoch": 0.55, + "learning_rate": 2.4865817751928716e-06, + "logits/chosen": -2.1895623207092285, + "logits/rejected": -2.174008369445801, + "logps/chosen": -193.5983123779297, + "logps/rejected": -231.7257537841797, + "loss": 0.6863, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.018903840333223343, + "rewards/margins": 0.11705265939235687, + "rewards/rejected": -0.09814882278442383, + "step": 8430 + }, + { + "epoch": 0.55, + "learning_rate": 2.4808719870888037e-06, + "logits/chosen": -2.0574288368225098, + "logits/rejected": -1.983668565750122, + "logps/chosen": -216.3809356689453, + "logps/rejected": -193.36599731445312, + "loss": 0.6896, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.012905344367027283, + "rewards/margins": 0.10258817672729492, + "rewards/rejected": -0.08968283236026764, + "step": 8440 + }, + { + "epoch": 0.55, + "learning_rate": 2.4751622987658206e-06, + "logits/chosen": -2.475071430206299, + "logits/rejected": -2.2941946983337402, + "logps/chosen": -235.42440795898438, + "logps/rejected": -230.3686065673828, + "loss": 0.6918, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.025258636102080345, + "rewards/margins": 0.05684860795736313, + "rewards/rejected": -0.03158997744321823, + "step": 8450 + }, + { + "epoch": 0.55, + "learning_rate": 2.4694527400084546e-06, + "logits/chosen": -2.25466251373291, + "logits/rejected": -2.161506175994873, + "logps/chosen": -222.21908569335938, + "logps/rejected": -224.0065155029297, + "loss": 0.6906, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0002555936516728252, + "rewards/margins": 0.07632071524858475, + "rewards/rejected": -0.07606511563062668, + "step": 8460 + }, + { + "epoch": 0.55, + "learning_rate": 2.4637433406005607e-06, + "logits/chosen": -2.4585928916931152, + "logits/rejected": -2.344909191131592, + "logps/chosen": -310.13995361328125, + "logps/rejected": -286.87567138671875, + "loss": 0.691, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.004136279225349426, + "rewards/margins": 0.04778838902711868, + "rewards/rejected": -0.05192466825246811, + "step": 8470 + }, + { + "epoch": 0.55, + "learning_rate": 2.4580341303251628e-06, + "logits/chosen": -2.2639238834381104, + "logits/rejected": -2.002631425857544, + "logps/chosen": -259.3542175292969, + "logps/rejected": -229.9150848388672, + "loss": 0.6899, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.005969688296318054, + "rewards/margins": 0.06959638744592667, + "rewards/rejected": -0.06362669169902802, + "step": 8480 + }, + { + "epoch": 0.56, + "learning_rate": 2.4523251389642984e-06, + "logits/chosen": -2.16398024559021, + "logits/rejected": -2.036417007446289, + "logps/chosen": -256.45709228515625, + "logps/rejected": -229.12576293945312, + "loss": 0.6877, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.018977751955389977, + "rewards/margins": 0.09807170927524567, + "rewards/rejected": -0.11704947054386139, + "step": 8490 + }, + { + "epoch": 0.56, + "learning_rate": 2.4466163962988626e-06, + "logits/chosen": -2.480299711227417, + "logits/rejected": -2.1086299419403076, + "logps/chosen": -281.618896484375, + "logps/rejected": -193.210205078125, + "loss": 0.6885, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.010179015807807446, + "rewards/margins": 0.10310627520084381, + "rewards/rejected": -0.11328530311584473, + "step": 8500 + }, + { + "epoch": 0.56, + "eval_logits/chosen": -2.2819478511810303, + "eval_logits/rejected": -2.0963072776794434, + "eval_logps/chosen": -231.94549560546875, + "eval_logps/rejected": -220.13168334960938, + "eval_loss": 0.689839780330658, + "eval_rewards/accuracies": 0.6455000042915344, + "eval_rewards/chosen": 0.0005945622688159347, + "eval_rewards/margins": 0.08579233288764954, + "eval_rewards/rejected": -0.08519777655601501, + "eval_runtime": 712.4374, + "eval_samples_per_second": 2.807, + "eval_steps_per_second": 1.404, + "step": 8500 + }, + { + "epoch": 0.56, + "learning_rate": 2.4409079321084543e-06, + "logits/chosen": -2.2277088165283203, + "logits/rejected": -2.284764051437378, + "logps/chosen": -213.2277374267578, + "logps/rejected": -252.33645629882812, + "loss": 0.6907, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.01891925372183323, + "rewards/margins": 0.0916977971792221, + "rewards/rejected": -0.07277854532003403, + "step": 8510 + }, + { + "epoch": 0.56, + "learning_rate": 2.4351997761712184e-06, + "logits/chosen": -2.4851880073547363, + "logits/rejected": -2.031656265258789, + "logps/chosen": -244.4697265625, + "logps/rejected": -189.30319213867188, + "loss": 0.6895, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.009185681119561195, + "rewards/margins": 0.11539344489574432, + "rewards/rejected": -0.10620777308940887, + "step": 8520 + }, + { + "epoch": 0.56, + "learning_rate": 2.4294919582636933e-06, + "logits/chosen": -2.274225950241089, + "logits/rejected": -2.129521369934082, + "logps/chosen": -209.3656768798828, + "logps/rejected": -206.8007354736328, + "loss": 0.6918, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.017368298023939133, + "rewards/margins": 0.09274449944496155, + "rewards/rejected": -0.07537619769573212, + "step": 8530 + }, + { + "epoch": 0.56, + "learning_rate": 2.423784508160652e-06, + "logits/chosen": -2.352238655090332, + "logits/rejected": -2.100398540496826, + "logps/chosen": -256.19207763671875, + "logps/rejected": -215.8179168701172, + "loss": 0.6912, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.009613009169697762, + "rewards/margins": 0.07463245093822479, + "rewards/rejected": -0.084245465695858, + "step": 8540 + }, + { + "epoch": 0.56, + "learning_rate": 2.418077455634951e-06, + "logits/chosen": -2.176546573638916, + "logits/rejected": -2.22251558303833, + "logps/chosen": -218.07138061523438, + "logps/rejected": -250.1627655029297, + "loss": 0.6917, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.015002986416220665, + "rewards/margins": 0.045342180877923965, + "rewards/rejected": -0.06034516543149948, + "step": 8550 + }, + { + "epoch": 0.56, + "learning_rate": 2.4123708304573714e-06, + "logits/chosen": -2.3653371334075928, + "logits/rejected": -2.2171475887298584, + "logps/chosen": -288.91107177734375, + "logps/rejected": -280.00177001953125, + "loss": 0.6899, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.008562428876757622, + "rewards/margins": 0.07789406925439835, + "rewards/rejected": -0.06933163106441498, + "step": 8560 + }, + { + "epoch": 0.56, + "learning_rate": 2.406664662396465e-06, + "logits/chosen": -2.1397430896759033, + "logits/rejected": -1.9881635904312134, + "logps/chosen": -188.5435333251953, + "logps/rejected": -179.0817108154297, + "loss": 0.691, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.04161657765507698, + "rewards/margins": 0.06220381706953049, + "rewards/rejected": -0.10382040590047836, + "step": 8570 + }, + { + "epoch": 0.56, + "learning_rate": 2.4009589812184012e-06, + "logits/chosen": -2.3080785274505615, + "logits/rejected": -1.9249913692474365, + "logps/chosen": -205.11972045898438, + "logps/rejected": -160.22409057617188, + "loss": 0.6887, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.018998144194483757, + "rewards/margins": 0.08215345442295074, + "rewards/rejected": -0.10115160048007965, + "step": 8580 + }, + { + "epoch": 0.56, + "learning_rate": 2.3952538166868073e-06, + "logits/chosen": -2.0843875408172607, + "logits/rejected": -2.144876480102539, + "logps/chosen": -218.38809204101562, + "logps/rejected": -214.17666625976562, + "loss": 0.6871, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.008417466655373573, + "rewards/margins": 0.1238173246383667, + "rewards/rejected": -0.13223478198051453, + "step": 8590 + }, + { + "epoch": 0.56, + "learning_rate": 2.389549198562616e-06, + "logits/chosen": -2.282944917678833, + "logits/rejected": -1.855536699295044, + "logps/chosen": -225.4339599609375, + "logps/rejected": -205.1066131591797, + "loss": 0.6889, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.0064767589792609215, + "rewards/margins": 0.09652809798717499, + "rewards/rejected": -0.09005134552717209, + "step": 8600 + }, + { + "epoch": 0.56, + "eval_logits/chosen": -2.289522886276245, + "eval_logits/rejected": -2.1033294200897217, + "eval_logps/chosen": -232.30738830566406, + "eval_logps/rejected": -220.40335083007812, + "eval_loss": 0.6898036003112793, + "eval_rewards/accuracies": 0.640999972820282, + "eval_rewards/chosen": -0.0030244227964431047, + "eval_rewards/margins": 0.08489015698432922, + "eval_rewards/rejected": -0.08791457116603851, + "eval_runtime": 714.176, + "eval_samples_per_second": 2.8, + "eval_steps_per_second": 1.4, + "step": 8600 + }, + { + "epoch": 0.56, + "learning_rate": 2.3838451566039098e-06, + "logits/chosen": -2.309410333633423, + "logits/rejected": -2.1341471672058105, + "logps/chosen": -240.350830078125, + "logps/rejected": -234.599365234375, + "loss": 0.6922, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.018408339470624924, + "rewards/margins": 0.04382626712322235, + "rewards/rejected": -0.062234602868556976, + "step": 8610 + }, + { + "epoch": 0.56, + "learning_rate": 2.3781417205657662e-06, + "logits/chosen": -2.3165881633758545, + "logits/rejected": -2.01545786857605, + "logps/chosen": -197.41787719726562, + "logps/rejected": -167.50404357910156, + "loss": 0.6907, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.003385829506441951, + "rewards/margins": 0.08293353766202927, + "rewards/rejected": -0.0795477032661438, + "step": 8620 + }, + { + "epoch": 0.56, + "learning_rate": 2.3724389202001006e-06, + "logits/chosen": -2.3407962322235107, + "logits/rejected": -2.0748586654663086, + "logps/chosen": -203.34979248046875, + "logps/rejected": -187.19520568847656, + "loss": 0.692, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.010263567790389061, + "rewards/margins": 0.055420707911252975, + "rewards/rejected": -0.06568428128957748, + "step": 8630 + }, + { + "epoch": 0.57, + "learning_rate": 2.366736785255514e-06, + "logits/chosen": -2.227527141571045, + "logits/rejected": -2.1566596031188965, + "logps/chosen": -200.77955627441406, + "logps/rejected": -196.5257110595703, + "loss": 0.6903, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.020507413893938065, + "rewards/margins": 0.06966142356395721, + "rewards/rejected": -0.09016883373260498, + "step": 8640 + }, + { + "epoch": 0.57, + "learning_rate": 2.3610353454771355e-06, + "logits/chosen": -2.123077869415283, + "logits/rejected": -2.05281925201416, + "logps/chosen": -190.9486083984375, + "logps/rejected": -181.61386108398438, + "loss": 0.6914, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.01001174096018076, + "rewards/margins": 0.06959837675094604, + "rewards/rejected": -0.07961011677980423, + "step": 8650 + }, + { + "epoch": 0.57, + "learning_rate": 2.355334630606467e-06, + "logits/chosen": -2.493744373321533, + "logits/rejected": -2.0158677101135254, + "logps/chosen": -240.03988647460938, + "logps/rejected": -183.7025146484375, + "loss": 0.6905, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.0025037326849997044, + "rewards/margins": 0.07208188623189926, + "rewards/rejected": -0.06957816332578659, + "step": 8660 + }, + { + "epoch": 0.57, + "learning_rate": 2.349634670381231e-06, + "logits/chosen": -2.0954480171203613, + "logits/rejected": -2.0449440479278564, + "logps/chosen": -208.32308959960938, + "logps/rejected": -224.8218231201172, + "loss": 0.6906, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.01123981736600399, + "rewards/margins": 0.0670008659362793, + "rewards/rejected": -0.07824068516492844, + "step": 8670 + }, + { + "epoch": 0.57, + "learning_rate": 2.3439354945352104e-06, + "logits/chosen": -2.341536045074463, + "logits/rejected": -2.278677463531494, + "logps/chosen": -245.1410675048828, + "logps/rejected": -203.81253051757812, + "loss": 0.6923, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.01617489382624626, + "rewards/margins": 0.030692163854837418, + "rewards/rejected": -0.04686705023050308, + "step": 8680 + }, + { + "epoch": 0.57, + "learning_rate": 2.3382371327981e-06, + "logits/chosen": -2.2057578563690186, + "logits/rejected": -2.200843334197998, + "logps/chosen": -230.00222778320312, + "logps/rejected": -225.0457000732422, + "loss": 0.6897, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.014740320853888988, + "rewards/margins": 0.09338275343179703, + "rewards/rejected": -0.07864242792129517, + "step": 8690 + }, + { + "epoch": 0.57, + "learning_rate": 2.3325396148953456e-06, + "logits/chosen": -2.073983907699585, + "logits/rejected": -2.184781312942505, + "logps/chosen": -172.51876831054688, + "logps/rejected": -236.9097900390625, + "loss": 0.6895, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.012173332273960114, + "rewards/margins": 0.10260754823684692, + "rewards/rejected": -0.11478088051080704, + "step": 8700 + }, + { + "epoch": 0.57, + "eval_logits/chosen": -2.2970077991485596, + "eval_logits/rejected": -2.1105120182037354, + "eval_logps/chosen": -230.84942626953125, + "eval_logps/rejected": -218.98678588867188, + "eval_loss": 0.6898258328437805, + "eval_rewards/accuracies": 0.6430000066757202, + "eval_rewards/chosen": 0.011555157601833344, + "eval_rewards/margins": 0.0853039100766182, + "eval_rewards/rejected": -0.07374875247478485, + "eval_runtime": 712.4551, + "eval_samples_per_second": 2.807, + "eval_steps_per_second": 1.404, + "step": 8700 + }, + { + "epoch": 0.57, + "learning_rate": 2.3268429705479915e-06, + "logits/chosen": -2.4747602939605713, + "logits/rejected": -2.1206700801849365, + "logps/chosen": -222.29537963867188, + "logps/rejected": -189.93572998046875, + "loss": 0.6905, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.02517220936715603, + "rewards/margins": 0.08959371596574783, + "rewards/rejected": -0.06442151963710785, + "step": 8710 + }, + { + "epoch": 0.57, + "learning_rate": 2.3211472294725248e-06, + "logits/chosen": -2.3218271732330322, + "logits/rejected": -2.1841847896575928, + "logps/chosen": -212.7299346923828, + "logps/rejected": -209.63003540039062, + "loss": 0.6906, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.03685241565108299, + "rewards/margins": 0.09567222744226456, + "rewards/rejected": -0.058819811791181564, + "step": 8720 + }, + { + "epoch": 0.57, + "learning_rate": 2.315452421380721e-06, + "logits/chosen": -2.2035815715789795, + "logits/rejected": -1.7523466348648071, + "logps/chosen": -253.510986328125, + "logps/rejected": -214.3549346923828, + "loss": 0.6887, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.014333168976008892, + "rewards/margins": 0.08837004750967026, + "rewards/rejected": -0.07403689622879028, + "step": 8730 + }, + { + "epoch": 0.57, + "learning_rate": 2.3097585759794886e-06, + "logits/chosen": -2.2899675369262695, + "logits/rejected": -1.9071069955825806, + "logps/chosen": -251.5904541015625, + "logps/rejected": -201.7096710205078, + "loss": 0.6867, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.027508467435836792, + "rewards/margins": 0.13511213660240173, + "rewards/rejected": -0.10760366916656494, + "step": 8740 + }, + { + "epoch": 0.57, + "learning_rate": 2.3040657229707155e-06, + "logits/chosen": -2.304961681365967, + "logits/rejected": -2.1966376304626465, + "logps/chosen": -170.78793334960938, + "logps/rejected": -190.66493225097656, + "loss": 0.69, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.012596605345606804, + "rewards/margins": 0.08321347087621689, + "rewards/rejected": -0.07061685621738434, + "step": 8750 + }, + { + "epoch": 0.57, + "learning_rate": 2.2983738920511104e-06, + "logits/chosen": -2.464939832687378, + "logits/rejected": -1.987932801246643, + "logps/chosen": -265.67718505859375, + "logps/rejected": -223.0201416015625, + "loss": 0.6913, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.04115743562579155, + "rewards/margins": 0.07645048201084137, + "rewards/rejected": -0.03529305011034012, + "step": 8760 + }, + { + "epoch": 0.57, + "learning_rate": 2.2926831129120523e-06, + "logits/chosen": -2.120628833770752, + "logits/rejected": -2.073472499847412, + "logps/chosen": -232.22799682617188, + "logps/rejected": -209.72720336914062, + "loss": 0.6919, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.03563634306192398, + "rewards/margins": 0.0585489384829998, + "rewards/rejected": -0.02291259728372097, + "step": 8770 + }, + { + "epoch": 0.57, + "learning_rate": 2.2869934152394323e-06, + "logits/chosen": -2.321106195449829, + "logits/rejected": -2.0543007850646973, + "logps/chosen": -268.8101501464844, + "logps/rejected": -220.52853393554688, + "loss": 0.6887, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.0015528525691479445, + "rewards/margins": 0.07320442795753479, + "rewards/rejected": -0.07165157794952393, + "step": 8780 + }, + { + "epoch": 0.58, + "learning_rate": 2.281304828713501e-06, + "logits/chosen": -2.2122366428375244, + "logits/rejected": -2.1436047554016113, + "logps/chosen": -231.771728515625, + "logps/rejected": -231.8656768798828, + "loss": 0.6902, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.012155646458268166, + "rewards/margins": 0.07596425712108612, + "rewards/rejected": -0.0638086199760437, + "step": 8790 + }, + { + "epoch": 0.58, + "learning_rate": 2.275617383008711e-06, + "logits/chosen": -2.3027684688568115, + "logits/rejected": -2.2025887966156006, + "logps/chosen": -235.1232452392578, + "logps/rejected": -237.69161987304688, + "loss": 0.6913, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.003641972318291664, + "rewards/margins": 0.05236151069402695, + "rewards/rejected": -0.048719536513090134, + "step": 8800 + }, + { + "epoch": 0.58, + "eval_logits/chosen": -2.304422616958618, + "eval_logits/rejected": -2.117183208465576, + "eval_logps/chosen": -229.04266357421875, + "eval_logps/rejected": -216.80625915527344, + "eval_loss": 0.6898345351219177, + "eval_rewards/accuracies": 0.6464999914169312, + "eval_rewards/chosen": 0.029622970148921013, + "eval_rewards/margins": 0.0815664604306221, + "eval_rewards/rejected": -0.05194348469376564, + "eval_runtime": 711.6478, + "eval_samples_per_second": 2.81, + "eval_steps_per_second": 1.405, + "step": 8800 + }, + { + "epoch": 0.58, + "learning_rate": 2.269931107793567e-06, + "logits/chosen": -2.2280020713806152, + "logits/rejected": -2.136003255844116, + "logps/chosen": -208.7579803466797, + "logps/rejected": -222.9391326904297, + "loss": 0.6908, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.035223446786403656, + "rewards/margins": 0.060886919498443604, + "rewards/rejected": -0.02566346526145935, + "step": 8810 + }, + { + "epoch": 0.58, + "learning_rate": 2.2642460327304655e-06, + "logits/chosen": -2.1614763736724854, + "logits/rejected": -2.2132813930511475, + "logps/chosen": -240.7371063232422, + "logps/rejected": -232.02880859375, + "loss": 0.6905, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.038226418197155, + "rewards/margins": 0.07139203697443008, + "rewards/rejected": -0.03316562622785568, + "step": 8820 + }, + { + "epoch": 0.58, + "learning_rate": 2.258562187475543e-06, + "logits/chosen": -2.15134859085083, + "logits/rejected": -2.1060047149658203, + "logps/chosen": -225.7406005859375, + "logps/rejected": -195.5087890625, + "loss": 0.6898, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.012063100934028625, + "rewards/margins": 0.07280053198337555, + "rewards/rejected": -0.060737431049346924, + "step": 8830 + }, + { + "epoch": 0.58, + "learning_rate": 2.2528796016785196e-06, + "logits/chosen": -2.197204828262329, + "logits/rejected": -2.018406391143799, + "logps/chosen": -187.3249053955078, + "logps/rejected": -208.0103759765625, + "loss": 0.6881, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.02593032643198967, + "rewards/margins": 0.11019430309534073, + "rewards/rejected": -0.08426396548748016, + "step": 8840 + }, + { + "epoch": 0.58, + "learning_rate": 2.247198304982548e-06, + "logits/chosen": -2.239647388458252, + "logits/rejected": -2.0400216579437256, + "logps/chosen": -159.90098571777344, + "logps/rejected": -163.93194580078125, + "loss": 0.6902, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.020705249160528183, + "rewards/margins": 0.07980392873287201, + "rewards/rejected": -0.05909866839647293, + "step": 8850 + }, + { + "epoch": 0.58, + "learning_rate": 2.2415183270240533e-06, + "logits/chosen": -2.512545108795166, + "logits/rejected": -2.268498182296753, + "logps/chosen": -197.340087890625, + "logps/rejected": -210.59591674804688, + "loss": 0.6887, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.01989520527422428, + "rewards/margins": 0.08950191736221313, + "rewards/rejected": -0.0696067214012146, + "step": 8860 + }, + { + "epoch": 0.58, + "learning_rate": 2.2358396974325837e-06, + "logits/chosen": -2.317462682723999, + "logits/rejected": -2.1121644973754883, + "logps/chosen": -239.705322265625, + "logps/rejected": -221.26022338867188, + "loss": 0.6881, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.025622522458434105, + "rewards/margins": 0.07821951061487198, + "rewards/rejected": -0.05259697511792183, + "step": 8870 + }, + { + "epoch": 0.58, + "learning_rate": 2.2301624458306525e-06, + "logits/chosen": -2.4108285903930664, + "logits/rejected": -2.1266798973083496, + "logps/chosen": -259.18951416015625, + "logps/rejected": -213.99862670898438, + "loss": 0.6911, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.0015859712148085237, + "rewards/margins": 0.061502885073423386, + "rewards/rejected": -0.05991692468523979, + "step": 8880 + }, + { + "epoch": 0.58, + "learning_rate": 2.2244866018335855e-06, + "logits/chosen": -2.2686455249786377, + "logits/rejected": -2.2580018043518066, + "logps/chosen": -215.9559326171875, + "logps/rejected": -237.7769012451172, + "loss": 0.6912, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.011017450131475925, + "rewards/margins": 0.07413921505212784, + "rewards/rejected": -0.06312176585197449, + "step": 8890 + }, + { + "epoch": 0.58, + "learning_rate": 2.2188121950493648e-06, + "logits/chosen": -2.4074501991271973, + "logits/rejected": -2.0326874256134033, + "logps/chosen": -219.9821319580078, + "logps/rejected": -146.60345458984375, + "loss": 0.6906, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.01238707359880209, + "rewards/margins": 0.0487365797162056, + "rewards/rejected": -0.06112365052103996, + "step": 8900 + }, + { + "epoch": 0.58, + "eval_logits/chosen": -2.305030584335327, + "eval_logits/rejected": -2.1172640323638916, + "eval_logps/chosen": -231.61558532714844, + "eval_logps/rejected": -220.36143493652344, + "eval_loss": 0.6898223757743835, + "eval_rewards/accuracies": 0.6485000252723694, + "eval_rewards/chosen": 0.003893795656040311, + "eval_rewards/margins": 0.09138916432857513, + "eval_rewards/rejected": -0.08749537914991379, + "eval_runtime": 712.4849, + "eval_samples_per_second": 2.807, + "eval_steps_per_second": 1.404, + "step": 8900 + }, + { + "epoch": 0.58, + "learning_rate": 2.2131392550784766e-06, + "logits/chosen": -2.4283366203308105, + "logits/rejected": -1.8540500402450562, + "logps/chosen": -285.9861145019531, + "logps/rejected": -198.90310668945312, + "loss": 0.6923, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.011074522510170937, + "rewards/margins": 0.08481260389089584, + "rewards/rejected": -0.09588713943958282, + "step": 8910 + }, + { + "epoch": 0.58, + "learning_rate": 2.2074678115137533e-06, + "logits/chosen": -2.1023287773132324, + "logits/rejected": -2.0058627128601074, + "logps/chosen": -195.75587463378906, + "logps/rejected": -213.19921875, + "loss": 0.6869, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.010389198549091816, + "rewards/margins": 0.12526783347129822, + "rewards/rejected": -0.13565704226493835, + "step": 8920 + }, + { + "epoch": 0.58, + "learning_rate": 2.201797893940224e-06, + "logits/chosen": -2.190784454345703, + "logits/rejected": -1.9884440898895264, + "logps/chosen": -232.9307403564453, + "logps/rejected": -261.10540771484375, + "loss": 0.6891, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.0024665123783051968, + "rewards/margins": 0.07824081182479858, + "rewards/rejected": -0.07577430456876755, + "step": 8930 + }, + { + "epoch": 0.58, + "learning_rate": 2.196129531934956e-06, + "logits/chosen": -2.2389838695526123, + "logits/rejected": -1.970949411392212, + "logps/chosen": -235.01522827148438, + "logps/rejected": -226.91268920898438, + "loss": 0.6903, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.02898770570755005, + "rewards/margins": 0.09960027039051056, + "rewards/rejected": -0.07061255723237991, + "step": 8940 + }, + { + "epoch": 0.59, + "learning_rate": 2.190462755066902e-06, + "logits/chosen": -2.251969814300537, + "logits/rejected": -2.020610809326172, + "logps/chosen": -263.11944580078125, + "logps/rejected": -244.75009155273438, + "loss": 0.6924, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.007182478904724121, + "rewards/margins": 0.07121424376964569, + "rewards/rejected": -0.07839672267436981, + "step": 8950 + }, + { + "epoch": 0.59, + "learning_rate": 2.184797592896746e-06, + "logits/chosen": -2.379193067550659, + "logits/rejected": -2.3389806747436523, + "logps/chosen": -233.12423706054688, + "logps/rejected": -215.5608673095703, + "loss": 0.6887, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.01311335526406765, + "rewards/margins": 0.08969788253307343, + "rewards/rejected": -0.07658452540636063, + "step": 8960 + }, + { + "epoch": 0.59, + "learning_rate": 2.17913407497675e-06, + "logits/chosen": -2.313098430633545, + "logits/rejected": -2.381880283355713, + "logps/chosen": -176.4674072265625, + "logps/rejected": -218.6103973388672, + "loss": 0.6915, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.019145773723721504, + "rewards/margins": 0.0819178968667984, + "rewards/rejected": -0.06277212500572205, + "step": 8970 + }, + { + "epoch": 0.59, + "learning_rate": 2.173472230850596e-06, + "logits/chosen": -2.4210402965545654, + "logits/rejected": -2.2281031608581543, + "logps/chosen": -201.3446044921875, + "logps/rejected": -165.3816680908203, + "loss": 0.6918, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.010820412077009678, + "rewards/margins": 0.04480786249041557, + "rewards/rejected": -0.05562828108668327, + "step": 8980 + }, + { + "epoch": 0.59, + "learning_rate": 2.1678120900532375e-06, + "logits/chosen": -2.4019229412078857, + "logits/rejected": -2.1054940223693848, + "logps/chosen": -235.7376251220703, + "logps/rejected": -216.53738403320312, + "loss": 0.691, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0017721873009577394, + "rewards/margins": 0.10060055553913116, + "rewards/rejected": -0.09882837533950806, + "step": 8990 + }, + { + "epoch": 0.59, + "learning_rate": 2.1621536821107412e-06, + "logits/chosen": -2.2904767990112305, + "logits/rejected": -2.159829616546631, + "logps/chosen": -201.26571655273438, + "logps/rejected": -169.5950164794922, + "loss": 0.6888, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.013041026890277863, + "rewards/margins": 0.103615902364254, + "rewards/rejected": -0.09057489037513733, + "step": 9000 + }, + { + "epoch": 0.59, + "eval_logits/chosen": -2.3073229789733887, + "eval_logits/rejected": -2.1196234226226807, + "eval_logps/chosen": -230.892333984375, + "eval_logps/rejected": -219.00497436523438, + "eval_loss": 0.6897886991500854, + "eval_rewards/accuracies": 0.6399999856948853, + "eval_rewards/chosen": 0.011126398108899593, + "eval_rewards/margins": 0.08505717664957047, + "eval_rewards/rejected": -0.0739307776093483, + "eval_runtime": 715.638, + "eval_samples_per_second": 2.795, + "eval_steps_per_second": 1.397, + "step": 9000 + }, + { + "epoch": 0.59, + "learning_rate": 2.1564970365401346e-06, + "logits/chosen": -2.342783212661743, + "logits/rejected": -2.0789589881896973, + "logps/chosen": -183.83070373535156, + "logps/rejected": -154.10012817382812, + "loss": 0.6886, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.0032037317287176847, + "rewards/margins": 0.08277516812086105, + "rewards/rejected": -0.0795714408159256, + "step": 9010 + }, + { + "epoch": 0.59, + "learning_rate": 2.1508421828492527e-06, + "logits/chosen": -2.4671521186828613, + "logits/rejected": -2.1197142601013184, + "logps/chosen": -222.3729705810547, + "logps/rejected": -161.41038513183594, + "loss": 0.6922, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.02819722518324852, + "rewards/margins": 0.07277707755565643, + "rewards/rejected": -0.04457986354827881, + "step": 9020 + }, + { + "epoch": 0.59, + "learning_rate": 2.145189150536582e-06, + "logits/chosen": -2.1406853199005127, + "logits/rejected": -2.043962001800537, + "logps/chosen": -219.89431762695312, + "logps/rejected": -176.14846801757812, + "loss": 0.6914, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.02275204285979271, + "rewards/margins": 0.05656403303146362, + "rewards/rejected": -0.03381199389696121, + "step": 9030 + }, + { + "epoch": 0.59, + "learning_rate": 2.139537969091107e-06, + "logits/chosen": -2.1763834953308105, + "logits/rejected": -2.141878843307495, + "logps/chosen": -265.5818786621094, + "logps/rejected": -207.55709838867188, + "loss": 0.693, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 1.7970241970033385e-05, + "rewards/margins": 0.0414576455950737, + "rewards/rejected": -0.04143967479467392, + "step": 9040 + }, + { + "epoch": 0.59, + "learning_rate": 2.1338886679921603e-06, + "logits/chosen": -2.2526087760925293, + "logits/rejected": -2.184154748916626, + "logps/chosen": -240.57388305664062, + "logps/rejected": -225.2537841796875, + "loss": 0.6912, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.009691650979220867, + "rewards/margins": 0.05090288445353508, + "rewards/rejected": -0.041211239993572235, + "step": 9050 + }, + { + "epoch": 0.59, + "learning_rate": 2.128241276709263e-06, + "logits/chosen": -2.3358287811279297, + "logits/rejected": -2.2810444831848145, + "logps/chosen": -199.28738403320312, + "logps/rejected": -226.06655883789062, + "loss": 0.6915, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.039743922650814056, + "rewards/margins": 0.06741781532764435, + "rewards/rejected": -0.027673888951539993, + "step": 9060 + }, + { + "epoch": 0.59, + "learning_rate": 2.1225958247019746e-06, + "logits/chosen": -2.3715434074401855, + "logits/rejected": -2.5027832984924316, + "logps/chosen": -184.3772430419922, + "logps/rejected": -209.1215362548828, + "loss": 0.6913, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.027645772323012352, + "rewards/margins": 0.046046603471040726, + "rewards/rejected": -0.018400834873318672, + "step": 9070 + }, + { + "epoch": 0.59, + "learning_rate": 2.1169523414197383e-06, + "logits/chosen": -2.1513938903808594, + "logits/rejected": -2.152141571044922, + "logps/chosen": -196.04464721679688, + "logps/rejected": -224.1463165283203, + "loss": 0.6907, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.027610694989562035, + "rewards/margins": 0.04780023545026779, + "rewards/rejected": -0.020189542323350906, + "step": 9080 + }, + { + "epoch": 0.59, + "learning_rate": 2.1113108563017267e-06, + "logits/chosen": -2.248032808303833, + "logits/rejected": -2.033977746963501, + "logps/chosen": -212.1124725341797, + "logps/rejected": -188.57859802246094, + "loss": 0.6908, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.006707633379846811, + "rewards/margins": 0.07095328718423843, + "rewards/rejected": -0.06424565613269806, + "step": 9090 + }, + { + "epoch": 0.6, + "learning_rate": 2.1056713987766905e-06, + "logits/chosen": -2.443134307861328, + "logits/rejected": -2.1070022583007812, + "logps/chosen": -212.5425262451172, + "logps/rejected": -172.75057983398438, + "loss": 0.6905, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.02657800354063511, + "rewards/margins": 0.08923501521348953, + "rewards/rejected": -0.06265701353549957, + "step": 9100 + }, + { + "epoch": 0.6, + "eval_logits/chosen": -2.312913179397583, + "eval_logits/rejected": -2.1251325607299805, + "eval_logps/chosen": -229.99118041992188, + "eval_logps/rejected": -216.90176391601562, + "eval_loss": 0.6898962259292603, + "eval_rewards/accuracies": 0.6324999928474426, + "eval_rewards/chosen": 0.02013748697936535, + "eval_rewards/margins": 0.07303596287965775, + "eval_rewards/rejected": -0.05289847403764725, + "eval_runtime": 710.189, + "eval_samples_per_second": 2.816, + "eval_steps_per_second": 1.408, + "step": 9100 + }, + { + "epoch": 0.6, + "learning_rate": 2.1000339982628022e-06, + "logits/chosen": -2.1159980297088623, + "logits/rejected": -2.1975278854370117, + "logps/chosen": -249.4115447998047, + "logps/rejected": -221.4599609375, + "loss": 0.6895, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.01861291006207466, + "rewards/margins": 0.06368504464626312, + "rewards/rejected": -0.04507213830947876, + "step": 9110 + }, + { + "epoch": 0.6, + "learning_rate": 2.0943986841675043e-06, + "logits/chosen": -2.3297810554504395, + "logits/rejected": -2.0988831520080566, + "logps/chosen": -199.1885986328125, + "logps/rejected": -190.6521453857422, + "loss": 0.6898, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.04022395610809326, + "rewards/margins": 0.10803340375423431, + "rewards/rejected": -0.06780944764614105, + "step": 9120 + }, + { + "epoch": 0.6, + "learning_rate": 2.088765485887356e-06, + "logits/chosen": -2.3123505115509033, + "logits/rejected": -2.110137939453125, + "logps/chosen": -242.97427368164062, + "logps/rejected": -205.7440185546875, + "loss": 0.6919, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.01740623079240322, + "rewards/margins": 0.04395180940628052, + "rewards/rejected": -0.026545578613877296, + "step": 9130 + }, + { + "epoch": 0.6, + "learning_rate": 2.083134432807879e-06, + "logits/chosen": -2.2417685985565186, + "logits/rejected": -2.172234058380127, + "logps/chosen": -193.64578247070312, + "logps/rejected": -223.5891876220703, + "loss": 0.6891, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.006263398565351963, + "rewards/margins": 0.08425328880548477, + "rewards/rejected": -0.07798988372087479, + "step": 9140 + }, + { + "epoch": 0.6, + "learning_rate": 2.077505554303404e-06, + "logits/chosen": -2.3099794387817383, + "logits/rejected": -2.274794816970825, + "logps/chosen": -169.19174194335938, + "logps/rejected": -179.0936279296875, + "loss": 0.6903, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.053602445870637894, + "rewards/margins": 0.05535256117582321, + "rewards/rejected": -0.0017501137917861342, + "step": 9150 + }, + { + "epoch": 0.6, + "learning_rate": 2.071878879736918e-06, + "logits/chosen": -2.3148703575134277, + "logits/rejected": -2.1224913597106934, + "logps/chosen": -245.26870727539062, + "logps/rejected": -331.78924560546875, + "loss": 0.691, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.01570909097790718, + "rewards/margins": 0.08236773312091827, + "rewards/rejected": -0.06665865331888199, + "step": 9160 + }, + { + "epoch": 0.6, + "learning_rate": 2.0662544384599136e-06, + "logits/chosen": -2.2138946056365967, + "logits/rejected": -2.138765335083008, + "logps/chosen": -197.99423217773438, + "logps/rejected": -190.30654907226562, + "loss": 0.6893, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.04112407937645912, + "rewards/margins": 0.08797650039196014, + "rewards/rejected": -0.046852417290210724, + "step": 9170 + }, + { + "epoch": 0.6, + "learning_rate": 2.0606322598122314e-06, + "logits/chosen": -2.2186291217803955, + "logits/rejected": -2.3340086936950684, + "logps/chosen": -185.21937561035156, + "logps/rejected": -208.082763671875, + "loss": 0.6924, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.012941636145114899, + "rewards/margins": 0.03202500194311142, + "rewards/rejected": -0.019083363935351372, + "step": 9180 + }, + { + "epoch": 0.6, + "learning_rate": 2.0550123731219085e-06, + "logits/chosen": -2.4984421730041504, + "logits/rejected": -2.322842836380005, + "logps/chosen": -249.52920532226562, + "logps/rejected": -215.18814086914062, + "loss": 0.6892, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.036506522446870804, + "rewards/margins": 0.06803809106349945, + "rewards/rejected": -0.03153156489133835, + "step": 9190 + }, + { + "epoch": 0.6, + "learning_rate": 2.0493948077050267e-06, + "logits/chosen": -2.162285327911377, + "logits/rejected": -1.980200171470642, + "logps/chosen": -190.65975952148438, + "logps/rejected": -181.0004425048828, + "loss": 0.6887, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.025962283834815025, + "rewards/margins": 0.08430268615484238, + "rewards/rejected": -0.0583404079079628, + "step": 9200 + }, + { + "epoch": 0.6, + "eval_logits/chosen": -2.328345537185669, + "eval_logits/rejected": -2.139697551727295, + "eval_logps/chosen": -229.9346923828125, + "eval_logps/rejected": -217.44418334960938, + "eval_loss": 0.6898381114006042, + "eval_rewards/accuracies": 0.6355000138282776, + "eval_rewards/chosen": 0.020702635869383812, + "eval_rewards/margins": 0.07902555167675018, + "eval_rewards/rejected": -0.05832291021943092, + "eval_runtime": 713.3551, + "eval_samples_per_second": 2.804, + "eval_steps_per_second": 1.402, + "step": 9200 + }, + { + "epoch": 0.6, + "learning_rate": 2.0437795928655596e-06, + "logits/chosen": -2.351074457168579, + "logits/rejected": -2.3267416954040527, + "logps/chosen": -279.7415466308594, + "logps/rejected": -267.31201171875, + "loss": 0.6906, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.019710825756192207, + "rewards/margins": 0.05754191428422928, + "rewards/rejected": -0.03783109039068222, + "step": 9210 + }, + { + "epoch": 0.6, + "learning_rate": 2.0381667578952184e-06, + "logits/chosen": -2.4281165599823, + "logits/rejected": -2.208249568939209, + "logps/chosen": -209.62942504882812, + "logps/rejected": -225.8242645263672, + "loss": 0.6882, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.012434298172593117, + "rewards/margins": 0.10577799379825592, + "rewards/rejected": -0.09334369748830795, + "step": 9220 + }, + { + "epoch": 0.6, + "learning_rate": 2.0325563320732995e-06, + "logits/chosen": -2.4986178874969482, + "logits/rejected": -2.109630584716797, + "logps/chosen": -262.3493347167969, + "logps/rejected": -227.225341796875, + "loss": 0.6905, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.009889942593872547, + "rewards/margins": 0.08893869817256927, + "rewards/rejected": -0.0790487602353096, + "step": 9230 + }, + { + "epoch": 0.6, + "learning_rate": 2.026948344666532e-06, + "logits/chosen": -2.2422378063201904, + "logits/rejected": -2.198185443878174, + "logps/chosen": -199.4209747314453, + "logps/rejected": -207.3389434814453, + "loss": 0.6888, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.002950438065454364, + "rewards/margins": 0.083625927567482, + "rewards/rejected": -0.08657635748386383, + "step": 9240 + }, + { + "epoch": 0.61, + "learning_rate": 2.0213428249289257e-06, + "logits/chosen": -2.2211735248565674, + "logits/rejected": -2.1415927410125732, + "logps/chosen": -196.4319610595703, + "logps/rejected": -207.9313507080078, + "loss": 0.6879, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.021085821092128754, + "rewards/margins": 0.09691883623600006, + "rewards/rejected": -0.0758330225944519, + "step": 9250 + }, + { + "epoch": 0.61, + "learning_rate": 2.0157398021016175e-06, + "logits/chosen": -2.2351865768432617, + "logits/rejected": -2.1974527835845947, + "logps/chosen": -144.62660217285156, + "logps/rejected": -199.48924255371094, + "loss": 0.691, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.02908751741051674, + "rewards/margins": 0.08657882362604141, + "rewards/rejected": -0.05749132111668587, + "step": 9260 + }, + { + "epoch": 0.61, + "learning_rate": 2.010139305412719e-06, + "logits/chosen": -2.506202220916748, + "logits/rejected": -2.2831943035125732, + "logps/chosen": -275.13079833984375, + "logps/rejected": -239.76907348632812, + "loss": 0.6913, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.014648482203483582, + "rewards/margins": 0.07002347707748413, + "rewards/rejected": -0.05537499859929085, + "step": 9270 + }, + { + "epoch": 0.61, + "learning_rate": 2.0045413640771644e-06, + "logits/chosen": -2.2399230003356934, + "logits/rejected": -2.382997989654541, + "logps/chosen": -254.0187530517578, + "logps/rejected": -266.322021484375, + "loss": 0.6881, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.01146581582725048, + "rewards/margins": 0.09047718346118927, + "rewards/rejected": -0.07901137322187424, + "step": 9280 + }, + { + "epoch": 0.61, + "learning_rate": 1.998946007296558e-06, + "logits/chosen": -2.476470708847046, + "logits/rejected": -2.1356618404388428, + "logps/chosen": -306.3233947753906, + "logps/rejected": -251.1251678466797, + "loss": 0.6897, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.02277219668030739, + "rewards/margins": 0.0823870599269867, + "rewards/rejected": -0.059614866971969604, + "step": 9290 + }, + { + "epoch": 0.61, + "learning_rate": 1.9933532642590215e-06, + "logits/chosen": -2.212050676345825, + "logits/rejected": -1.8353458642959595, + "logps/chosen": -184.3854522705078, + "logps/rejected": -143.99754333496094, + "loss": 0.6899, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.04772832244634628, + "rewards/margins": 0.09772919118404388, + "rewards/rejected": -0.0500008650124073, + "step": 9300 + }, + { + "epoch": 0.61, + "eval_logits/chosen": -2.333347797393799, + "eval_logits/rejected": -2.1441400051116943, + "eval_logps/chosen": -231.38299560546875, + "eval_logps/rejected": -219.56932067871094, + "eval_loss": 0.689826488494873, + "eval_rewards/accuracies": 0.637499988079071, + "eval_rewards/chosen": 0.006219496019184589, + "eval_rewards/margins": 0.08579385280609131, + "eval_rewards/rejected": -0.079574353992939, + "eval_runtime": 709.6682, + "eval_samples_per_second": 2.818, + "eval_steps_per_second": 1.409, + "step": 9300 + }, + { + "epoch": 0.61, + "learning_rate": 1.987763164139042e-06, + "logits/chosen": -2.3631813526153564, + "logits/rejected": -2.159517288208008, + "logps/chosen": -204.81552124023438, + "logps/rejected": -214.0128631591797, + "loss": 0.6898, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.004395435098558664, + "rewards/margins": 0.06943001598119736, + "rewards/rejected": -0.06503458321094513, + "step": 9310 + }, + { + "epoch": 0.61, + "learning_rate": 1.982175736097321e-06, + "logits/chosen": -2.047428846359253, + "logits/rejected": -2.0620810985565186, + "logps/chosen": -275.38446044921875, + "logps/rejected": -301.35516357421875, + "loss": 0.6902, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.008434224873781204, + "rewards/margins": 0.08950887620449066, + "rewards/rejected": -0.09794311225414276, + "step": 9320 + }, + { + "epoch": 0.61, + "learning_rate": 1.9765910092806196e-06, + "logits/chosen": -2.2493948936462402, + "logits/rejected": -2.153007984161377, + "logps/chosen": -177.16024780273438, + "logps/rejected": -163.21905517578125, + "loss": 0.6901, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.009152812883257866, + "rewards/margins": 0.05717051774263382, + "rewards/rejected": -0.048017702996730804, + "step": 9330 + }, + { + "epoch": 0.61, + "learning_rate": 1.9710090128216083e-06, + "logits/chosen": -2.3084399700164795, + "logits/rejected": -2.213273525238037, + "logps/chosen": -224.1075897216797, + "logps/rejected": -218.4535675048828, + "loss": 0.6883, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.013775345869362354, + "rewards/margins": 0.10509101301431656, + "rewards/rejected": -0.11886636167764664, + "step": 9340 + }, + { + "epoch": 0.61, + "learning_rate": 1.9654297758387155e-06, + "logits/chosen": -2.1536035537719727, + "logits/rejected": -2.131392002105713, + "logps/chosen": -163.9993133544922, + "logps/rejected": -192.57957458496094, + "loss": 0.6907, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.031292807310819626, + "rewards/margins": 0.07854814827442169, + "rewards/rejected": -0.10984095185995102, + "step": 9350 + }, + { + "epoch": 0.61, + "learning_rate": 1.9598533274359736e-06, + "logits/chosen": -2.3260645866394043, + "logits/rejected": -2.233445882797241, + "logps/chosen": -243.11318969726562, + "logps/rejected": -253.74349975585938, + "loss": 0.6924, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.03432890772819519, + "rewards/margins": 0.03268744423985481, + "rewards/rejected": -0.0670163482427597, + "step": 9360 + }, + { + "epoch": 0.61, + "learning_rate": 1.9542796967028697e-06, + "logits/chosen": -2.334592819213867, + "logits/rejected": -2.218923330307007, + "logps/chosen": -219.28659057617188, + "logps/rejected": -207.3817596435547, + "loss": 0.6917, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.01046024076640606, + "rewards/margins": 0.0633801594376564, + "rewards/rejected": -0.07384039461612701, + "step": 9370 + }, + { + "epoch": 0.61, + "learning_rate": 1.948708912714192e-06, + "logits/chosen": -2.243697166442871, + "logits/rejected": -2.0024325847625732, + "logps/chosen": -252.65280151367188, + "logps/rejected": -229.02334594726562, + "loss": 0.692, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.05707541108131409, + "rewards/margins": 0.07306285202503204, + "rewards/rejected": -0.13013826310634613, + "step": 9380 + }, + { + "epoch": 0.61, + "learning_rate": 1.9431410045298786e-06, + "logits/chosen": -2.0937747955322266, + "logits/rejected": -2.023585796356201, + "logps/chosen": -219.1824951171875, + "logps/rejected": -225.77780151367188, + "loss": 0.6898, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.020571475848555565, + "rewards/margins": 0.07571020722389221, + "rewards/rejected": -0.09628168493509293, + "step": 9390 + }, + { + "epoch": 0.62, + "learning_rate": 1.9375760011948654e-06, + "logits/chosen": -2.4148128032684326, + "logits/rejected": -2.2704997062683105, + "logps/chosen": -202.109130859375, + "logps/rejected": -234.7781982421875, + "loss": 0.6884, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.024311980232596397, + "rewards/margins": 0.10348609834909439, + "rewards/rejected": -0.12779806554317474, + "step": 9400 + }, + { + "epoch": 0.62, + "eval_logits/chosen": -2.3321480751037598, + "eval_logits/rejected": -2.1431541442871094, + "eval_logps/chosen": -234.85800170898438, + "eval_logps/rejected": -222.5006866455078, + "eval_loss": 0.6898515224456787, + "eval_rewards/accuracies": 0.6334999799728394, + "eval_rewards/chosen": -0.028530515730381012, + "eval_rewards/margins": 0.08035717159509659, + "eval_rewards/rejected": -0.1088876873254776, + "eval_runtime": 710.5474, + "eval_samples_per_second": 2.815, + "eval_steps_per_second": 1.407, + "step": 9400 + }, + { + "epoch": 0.62, + "learning_rate": 1.932013931738937e-06, + "logits/chosen": -2.310518741607666, + "logits/rejected": -2.0845718383789062, + "logps/chosen": -207.1325225830078, + "logps/rejected": -232.1420135498047, + "loss": 0.6856, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.03530799597501755, + "rewards/margins": 0.12127707153558731, + "rewards/rejected": -0.15658505260944366, + "step": 9410 + }, + { + "epoch": 0.62, + "learning_rate": 1.9264548251765717e-06, + "logits/chosen": -2.426779270172119, + "logits/rejected": -2.238455295562744, + "logps/chosen": -205.3507080078125, + "logps/rejected": -209.31491088867188, + "loss": 0.6911, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.015480468980967999, + "rewards/margins": 0.07075979560613632, + "rewards/rejected": -0.0862402692437172, + "step": 9420 + }, + { + "epoch": 0.62, + "learning_rate": 1.9208987105067924e-06, + "logits/chosen": -2.2212510108947754, + "logits/rejected": -2.0826263427734375, + "logps/chosen": -216.4222412109375, + "logps/rejected": -198.42672729492188, + "loss": 0.6914, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.01865190640091896, + "rewards/margins": 0.058921001851558685, + "rewards/rejected": -0.07757291197776794, + "step": 9430 + }, + { + "epoch": 0.62, + "learning_rate": 1.9153456167130154e-06, + "logits/chosen": -2.3300156593322754, + "logits/rejected": -2.324368715286255, + "logps/chosen": -206.3992156982422, + "logps/rejected": -240.15884399414062, + "loss": 0.6907, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.023285966366529465, + "rewards/margins": 0.0756094753742218, + "rewards/rejected": -0.09889544546604156, + "step": 9440 + }, + { + "epoch": 0.62, + "learning_rate": 1.9097955727628975e-06, + "logits/chosen": -2.3564929962158203, + "logits/rejected": -2.353801965713501, + "logps/chosen": -196.1259765625, + "logps/rejected": -216.0775604248047, + "loss": 0.6901, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.0014045886928215623, + "rewards/margins": 0.0801662728190422, + "rewards/rejected": -0.08157085627317429, + "step": 9450 + }, + { + "epoch": 0.62, + "learning_rate": 1.904248607608187e-06, + "logits/chosen": -2.2641825675964355, + "logits/rejected": -2.2951583862304688, + "logps/chosen": -257.591552734375, + "logps/rejected": -216.25, + "loss": 0.692, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.003715710248798132, + "rewards/margins": 0.05765196681022644, + "rewards/rejected": -0.06136767938733101, + "step": 9460 + }, + { + "epoch": 0.62, + "learning_rate": 1.8987047501845714e-06, + "logits/chosen": -2.3341257572174072, + "logits/rejected": -2.302320957183838, + "logps/chosen": -166.62255859375, + "logps/rejected": -169.2261199951172, + "loss": 0.6898, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.0030190914403647184, + "rewards/margins": 0.08849085867404938, + "rewards/rejected": -0.08547177165746689, + "step": 9470 + }, + { + "epoch": 0.62, + "learning_rate": 1.8931640294115267e-06, + "logits/chosen": -2.1365644931793213, + "logits/rejected": -2.0182435512542725, + "logps/chosen": -193.88600158691406, + "logps/rejected": -189.7248992919922, + "loss": 0.6896, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.0011992065701633692, + "rewards/margins": 0.11369111388921738, + "rewards/rejected": -0.11249189078807831, + "step": 9480 + }, + { + "epoch": 0.62, + "learning_rate": 1.8876264741921662e-06, + "logits/chosen": -2.102898120880127, + "logits/rejected": -2.123107433319092, + "logps/chosen": -189.058837890625, + "logps/rejected": -195.1671142578125, + "loss": 0.6866, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.01066638994961977, + "rewards/margins": 0.11543774604797363, + "rewards/rejected": -0.10477133840322495, + "step": 9490 + }, + { + "epoch": 0.62, + "learning_rate": 1.8820921134130912e-06, + "logits/chosen": -2.3311290740966797, + "logits/rejected": -1.958857774734497, + "logps/chosen": -230.97128295898438, + "logps/rejected": -198.6006317138672, + "loss": 0.6871, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.004591288510710001, + "rewards/margins": 0.12538619339466095, + "rewards/rejected": -0.1299774944782257, + "step": 9500 + }, + { + "epoch": 0.62, + "eval_logits/chosen": -2.332388162612915, + "eval_logits/rejected": -2.143458604812622, + "eval_logps/chosen": -232.95985412597656, + "eval_logps/rejected": -220.78396606445312, + "eval_loss": 0.6898452639579773, + "eval_rewards/accuracies": 0.6365000009536743, + "eval_rewards/chosen": -0.009548979811370373, + "eval_rewards/margins": 0.08217175304889679, + "eval_rewards/rejected": -0.09172075241804123, + "eval_runtime": 713.2212, + "eval_samples_per_second": 2.804, + "eval_steps_per_second": 1.402, + "step": 9500 + }, + { + "epoch": 0.62, + "learning_rate": 1.8765609759442378e-06, + "logits/chosen": -2.1907570362091064, + "logits/rejected": -2.0811009407043457, + "logps/chosen": -242.083251953125, + "logps/rejected": -238.7690887451172, + "loss": 0.6899, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0076196156442165375, + "rewards/margins": 0.07497996836900711, + "rewards/rejected": -0.08259958773851395, + "step": 9510 + }, + { + "epoch": 0.62, + "learning_rate": 1.8710330906387288e-06, + "logits/chosen": -2.3943378925323486, + "logits/rejected": -2.34335994720459, + "logps/chosen": -238.8350372314453, + "logps/rejected": -271.54937744140625, + "loss": 0.6896, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.003474016208201647, + "rewards/margins": 0.09124413132667542, + "rewards/rejected": -0.09471814334392548, + "step": 9520 + }, + { + "epoch": 0.62, + "learning_rate": 1.8655084863327222e-06, + "logits/chosen": -2.304542303085327, + "logits/rejected": -2.324296474456787, + "logps/chosen": -183.95468139648438, + "logps/rejected": -195.6629180908203, + "loss": 0.6922, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.004966600798070431, + "rewards/margins": 0.055461425334215164, + "rewards/rejected": -0.060428015887737274, + "step": 9530 + }, + { + "epoch": 0.62, + "learning_rate": 1.8599871918452603e-06, + "logits/chosen": -2.1602072715759277, + "logits/rejected": -2.1528382301330566, + "logps/chosen": -221.7826690673828, + "logps/rejected": -245.52969360351562, + "loss": 0.6904, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.01309148408472538, + "rewards/margins": 0.10497549921274185, + "rewards/rejected": -0.09188400954008102, + "step": 9540 + }, + { + "epoch": 0.62, + "learning_rate": 1.8544692359781192e-06, + "logits/chosen": -2.3558590412139893, + "logits/rejected": -2.1197030544281006, + "logps/chosen": -186.4560546875, + "logps/rejected": -162.83966064453125, + "loss": 0.6908, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.011286085471510887, + "rewards/margins": 0.07330868393182755, + "rewards/rejected": -0.062022604048252106, + "step": 9550 + }, + { + "epoch": 0.63, + "learning_rate": 1.8489546475156602e-06, + "logits/chosen": -2.5130443572998047, + "logits/rejected": -2.286351442337036, + "logps/chosen": -218.3134765625, + "logps/rejected": -207.4962615966797, + "loss": 0.6915, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.011793679557740688, + "rewards/margins": 0.0735076516866684, + "rewards/rejected": -0.06171398237347603, + "step": 9560 + }, + { + "epoch": 0.63, + "learning_rate": 1.8434434552246778e-06, + "logits/chosen": -2.1478981971740723, + "logits/rejected": -2.0774495601654053, + "logps/chosen": -210.42529296875, + "logps/rejected": -207.1792755126953, + "loss": 0.6895, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.007937637157738209, + "rewards/margins": 0.08016934990882874, + "rewards/rejected": -0.08810698240995407, + "step": 9570 + }, + { + "epoch": 0.63, + "learning_rate": 1.837935687854251e-06, + "logits/chosen": -2.3582987785339355, + "logits/rejected": -2.111501693725586, + "logps/chosen": -219.9694366455078, + "logps/rejected": -197.4185333251953, + "loss": 0.6883, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.015772990882396698, + "rewards/margins": 0.09249218553304672, + "rewards/rejected": -0.07671918720006943, + "step": 9580 + }, + { + "epoch": 0.63, + "learning_rate": 1.832431374135592e-06, + "logits/chosen": -2.4764034748077393, + "logits/rejected": -2.07975697517395, + "logps/chosen": -249.00634765625, + "logps/rejected": -241.4837188720703, + "loss": 0.687, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.005709916818886995, + "rewards/margins": 0.11350512504577637, + "rewards/rejected": -0.11921503394842148, + "step": 9590 + }, + { + "epoch": 0.63, + "learning_rate": 1.8269305427818977e-06, + "logits/chosen": -2.4727559089660645, + "logits/rejected": -2.327918529510498, + "logps/chosen": -208.00125122070312, + "logps/rejected": -186.8267059326172, + "loss": 0.6905, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.006150106433779001, + "rewards/margins": 0.06489264965057373, + "rewards/rejected": -0.05874254181981087, + "step": 9600 + }, + { + "epoch": 0.63, + "eval_logits/chosen": -2.3416812419891357, + "eval_logits/rejected": -2.1520164012908936, + "eval_logps/chosen": -229.97618103027344, + "eval_logps/rejected": -218.22509765625, + "eval_loss": 0.6898518204689026, + "eval_rewards/accuracies": 0.6384999752044678, + "eval_rewards/chosen": 0.02028742991387844, + "eval_rewards/margins": 0.08641922473907471, + "eval_rewards/rejected": -0.06613180041313171, + "eval_runtime": 712.8227, + "eval_samples_per_second": 2.806, + "eval_steps_per_second": 1.403, + "step": 9600 + }, + { + "epoch": 0.63, + "learning_rate": 1.821433222488199e-06, + "logits/chosen": -2.339639663696289, + "logits/rejected": -2.0263071060180664, + "logps/chosen": -221.83694458007812, + "logps/rejected": -203.28822326660156, + "loss": 0.6891, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.037778157740831375, + "rewards/margins": 0.09198880195617676, + "rewards/rejected": -0.05421064421534538, + "step": 9610 + }, + { + "epoch": 0.63, + "learning_rate": 1.8159394419312112e-06, + "logits/chosen": -2.377436399459839, + "logits/rejected": -2.1873252391815186, + "logps/chosen": -256.22357177734375, + "logps/rejected": -222.9322967529297, + "loss": 0.6871, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.025064552202820778, + "rewards/margins": 0.13247425854206085, + "rewards/rejected": -0.10740969330072403, + "step": 9620 + }, + { + "epoch": 0.63, + "learning_rate": 1.8104492297691845e-06, + "logits/chosen": -2.3256943225860596, + "logits/rejected": -2.104449510574341, + "logps/chosen": -228.5015869140625, + "logps/rejected": -213.09912109375, + "loss": 0.692, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.019784385338425636, + "rewards/margins": 0.08710122108459473, + "rewards/rejected": -0.10688559710979462, + "step": 9630 + }, + { + "epoch": 0.63, + "learning_rate": 1.8049626146417562e-06, + "logits/chosen": -2.1221089363098145, + "logits/rejected": -1.9787824153900146, + "logps/chosen": -155.16111755371094, + "logps/rejected": -163.43637084960938, + "loss": 0.6907, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.0008523104479536414, + "rewards/margins": 0.11988194286823273, + "rewards/rejected": -0.12073423713445663, + "step": 9640 + }, + { + "epoch": 0.63, + "learning_rate": 1.7994796251697983e-06, + "logits/chosen": -2.2313790321350098, + "logits/rejected": -2.09123158454895, + "logps/chosen": -196.5417022705078, + "logps/rejected": -248.27761840820312, + "loss": 0.6885, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.04502807930111885, + "rewards/margins": 0.12171129137277603, + "rewards/rejected": -0.166739359498024, + "step": 9650 + }, + { + "epoch": 0.63, + "learning_rate": 1.794000289955269e-06, + "logits/chosen": -2.2783684730529785, + "logits/rejected": -2.106417417526245, + "logps/chosen": -269.64556884765625, + "logps/rejected": -246.83901977539062, + "loss": 0.6888, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.025853093713521957, + "rewards/margins": 0.08640275150537491, + "rewards/rejected": -0.11225583404302597, + "step": 9660 + }, + { + "epoch": 0.63, + "learning_rate": 1.7885246375810646e-06, + "logits/chosen": -2.2515721321105957, + "logits/rejected": -1.9777309894561768, + "logps/chosen": -224.28076171875, + "logps/rejected": -228.09130859375, + "loss": 0.6911, + "rewards/accuracies": 0.625, + "rewards/chosen": -6.888061761856079e-05, + "rewards/margins": 0.07572519034147263, + "rewards/rejected": -0.07579407840967178, + "step": 9670 + }, + { + "epoch": 0.63, + "learning_rate": 1.7830526966108713e-06, + "logits/chosen": -2.1138720512390137, + "logits/rejected": -1.9187238216400146, + "logps/chosen": -189.40728759765625, + "logps/rejected": -175.88150024414062, + "loss": 0.6841, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.029343629255890846, + "rewards/margins": 0.15085718035697937, + "rewards/rejected": -0.18020080029964447, + "step": 9680 + }, + { + "epoch": 0.63, + "learning_rate": 1.7775844955890129e-06, + "logits/chosen": -2.271742343902588, + "logits/rejected": -2.1086041927337646, + "logps/chosen": -210.3048095703125, + "logps/rejected": -209.052978515625, + "loss": 0.6887, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.012415561825037003, + "rewards/margins": 0.103248730301857, + "rewards/rejected": -0.0908331573009491, + "step": 9690 + }, + { + "epoch": 0.63, + "learning_rate": 1.7721200630403046e-06, + "logits/chosen": -2.346208095550537, + "logits/rejected": -2.157820224761963, + "logps/chosen": -195.90628051757812, + "logps/rejected": -228.021240234375, + "loss": 0.6895, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.0014390780124813318, + "rewards/margins": 0.06569032371044159, + "rewards/rejected": -0.06712940335273743, + "step": 9700 + }, + { + "epoch": 0.63, + "eval_logits/chosen": -2.3423051834106445, + "eval_logits/rejected": -2.1527085304260254, + "eval_logps/chosen": -231.52012634277344, + "eval_logps/rejected": -219.43946838378906, + "eval_loss": 0.6898437142372131, + "eval_rewards/accuracies": 0.6439999938011169, + "eval_rewards/chosen": 0.004848138429224491, + "eval_rewards/margins": 0.08312396705150604, + "eval_rewards/rejected": -0.07827582955360413, + "eval_runtime": 710.5463, + "eval_samples_per_second": 2.815, + "eval_steps_per_second": 1.407, + "step": 9700 + }, + { + "epoch": 0.64, + "learning_rate": 1.7666594274699037e-06, + "logits/chosen": -2.264838457107544, + "logits/rejected": -2.1133649349212646, + "logps/chosen": -250.9988555908203, + "logps/rejected": -223.15579223632812, + "loss": 0.6889, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0169143658131361, + "rewards/margins": 0.10361073911190033, + "rewards/rejected": -0.08669638633728027, + "step": 9710 + }, + { + "epoch": 0.64, + "learning_rate": 1.76120261736316e-06, + "logits/chosen": -2.314434051513672, + "logits/rejected": -1.943964958190918, + "logps/chosen": -227.54330444335938, + "logps/rejected": -212.75125122070312, + "loss": 0.6871, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.00828264094889164, + "rewards/margins": 0.12351206690073013, + "rewards/rejected": -0.13179472088813782, + "step": 9720 + }, + { + "epoch": 0.64, + "learning_rate": 1.755749661185468e-06, + "logits/chosen": -2.382049322128296, + "logits/rejected": -1.948427438735962, + "logps/chosen": -290.7059631347656, + "logps/rejected": -242.9027557373047, + "loss": 0.6899, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.03743572160601616, + "rewards/margins": 0.1103050485253334, + "rewards/rejected": -0.07286933809518814, + "step": 9730 + }, + { + "epoch": 0.64, + "learning_rate": 1.7503005873821183e-06, + "logits/chosen": -2.320734739303589, + "logits/rejected": -2.263913631439209, + "logps/chosen": -158.90725708007812, + "logps/rejected": -193.49929809570312, + "loss": 0.6896, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.015210744924843311, + "rewards/margins": 0.08328135311603546, + "rewards/rejected": -0.06807061284780502, + "step": 9740 + }, + { + "epoch": 0.64, + "learning_rate": 1.744855424378148e-06, + "logits/chosen": -2.1784415245056152, + "logits/rejected": -2.192660093307495, + "logps/chosen": -186.56448364257812, + "logps/rejected": -220.04287719726562, + "loss": 0.6906, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.02062476985156536, + "rewards/margins": 0.10032866150140762, + "rewards/rejected": -0.07970388978719711, + "step": 9750 + }, + { + "epoch": 0.64, + "learning_rate": 1.7394142005781973e-06, + "logits/chosen": -2.121195077896118, + "logits/rejected": -2.2074551582336426, + "logps/chosen": -260.9930114746094, + "logps/rejected": -271.81256103515625, + "loss": 0.6932, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.007180415093898773, + "rewards/margins": 0.07248598337173462, + "rewards/rejected": -0.06530557572841644, + "step": 9760 + }, + { + "epoch": 0.64, + "learning_rate": 1.7339769443663528e-06, + "logits/chosen": -2.3068153858184814, + "logits/rejected": -2.1621522903442383, + "logps/chosen": -142.29673767089844, + "logps/rejected": -153.0177764892578, + "loss": 0.6874, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.006454641930758953, + "rewards/margins": 0.10017760843038559, + "rewards/rejected": -0.09372296929359436, + "step": 9770 + }, + { + "epoch": 0.64, + "learning_rate": 1.7285436841060078e-06, + "logits/chosen": -2.4975733757019043, + "logits/rejected": -2.209691047668457, + "logps/chosen": -274.84765625, + "logps/rejected": -240.65109252929688, + "loss": 0.6894, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.012807024642825127, + "rewards/margins": 0.08757642656564713, + "rewards/rejected": -0.07476940006017685, + "step": 9780 + }, + { + "epoch": 0.64, + "learning_rate": 1.7231144481397083e-06, + "logits/chosen": -2.3894002437591553, + "logits/rejected": -2.312439441680908, + "logps/chosen": -221.06201171875, + "logps/rejected": -201.44149780273438, + "loss": 0.6904, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.006636572536081076, + "rewards/margins": 0.058181844651699066, + "rewards/rejected": -0.06481841951608658, + "step": 9790 + }, + { + "epoch": 0.64, + "learning_rate": 1.7176892647890092e-06, + "logits/chosen": -2.457059383392334, + "logits/rejected": -2.2145180702209473, + "logps/chosen": -236.02871704101562, + "logps/rejected": -202.4846649169922, + "loss": 0.6915, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.0036267780233174562, + "rewards/margins": 0.05337395519018173, + "rewards/rejected": -0.05700073391199112, + "step": 9800 + }, + { + "epoch": 0.64, + "eval_logits/chosen": -2.3302435874938965, + "eval_logits/rejected": -2.1416378021240234, + "eval_logps/chosen": -232.28138732910156, + "eval_logps/rejected": -219.88731384277344, + "eval_loss": 0.6898448467254639, + "eval_rewards/accuracies": 0.6420000195503235, + "eval_rewards/chosen": -0.002764492528513074, + "eval_rewards/margins": 0.07998983561992645, + "eval_rewards/rejected": -0.08275433629751205, + "eval_runtime": 711.5893, + "eval_samples_per_second": 2.811, + "eval_steps_per_second": 1.405, + "step": 9800 + }, + { + "epoch": 0.64, + "learning_rate": 1.7122681623543239e-06, + "logits/chosen": -2.4701716899871826, + "logits/rejected": -2.205655574798584, + "logps/chosen": -245.60400390625, + "logps/rejected": -244.6284637451172, + "loss": 0.6906, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.013846084475517273, + "rewards/margins": 0.10238330066204071, + "rewards/rejected": -0.08853721618652344, + "step": 9810 + }, + { + "epoch": 0.64, + "learning_rate": 1.7068511691147788e-06, + "logits/chosen": -2.2074344158172607, + "logits/rejected": -2.2472825050354004, + "logps/chosen": -191.83944702148438, + "logps/rejected": -208.13674926757812, + "loss": 0.6909, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.02291598729789257, + "rewards/margins": 0.07927681505680084, + "rewards/rejected": -0.05636082962155342, + "step": 9820 + }, + { + "epoch": 0.64, + "learning_rate": 1.7014383133280636e-06, + "logits/chosen": -2.4390132427215576, + "logits/rejected": -2.075756311416626, + "logps/chosen": -250.9913330078125, + "logps/rejected": -205.5496826171875, + "loss": 0.6904, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.02356928028166294, + "rewards/margins": 0.07952290773391724, + "rewards/rejected": -0.10309220850467682, + "step": 9830 + }, + { + "epoch": 0.64, + "learning_rate": 1.696029623230286e-06, + "logits/chosen": -2.4049618244171143, + "logits/rejected": -2.321725845336914, + "logps/chosen": -255.1918487548828, + "logps/rejected": -282.19464111328125, + "loss": 0.6898, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.007304821163415909, + "rewards/margins": 0.08223724365234375, + "rewards/rejected": -0.07493243366479874, + "step": 9840 + }, + { + "epoch": 0.64, + "learning_rate": 1.6906251270358229e-06, + "logits/chosen": -2.3903801441192627, + "logits/rejected": -2.23978853225708, + "logps/chosen": -264.2168273925781, + "logps/rejected": -220.81857299804688, + "loss": 0.6908, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.013624888844788074, + "rewards/margins": 0.06343577802181244, + "rewards/rejected": -0.07706067711114883, + "step": 9850 + }, + { + "epoch": 0.65, + "learning_rate": 1.685224852937174e-06, + "logits/chosen": -2.1878368854522705, + "logits/rejected": -2.0732967853546143, + "logps/chosen": -197.29331970214844, + "logps/rejected": -254.4430694580078, + "loss": 0.683, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.010714609175920486, + "rewards/margins": 0.15290780365467072, + "rewards/rejected": -0.14219316840171814, + "step": 9860 + }, + { + "epoch": 0.65, + "learning_rate": 1.6798288291048136e-06, + "logits/chosen": -2.151094436645508, + "logits/rejected": -2.0521655082702637, + "logps/chosen": -224.37606811523438, + "logps/rejected": -207.9828643798828, + "loss": 0.6871, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.011344604194164276, + "rewards/margins": 0.12311786413192749, + "rewards/rejected": -0.13446247577667236, + "step": 9870 + }, + { + "epoch": 0.65, + "learning_rate": 1.6744370836870466e-06, + "logits/chosen": -2.555605888366699, + "logits/rejected": -2.2639527320861816, + "logps/chosen": -333.4253845214844, + "logps/rejected": -261.70599365234375, + "loss": 0.687, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0126813855022192, + "rewards/margins": 0.1179838627576828, + "rewards/rejected": -0.10530247539281845, + "step": 9880 + }, + { + "epoch": 0.65, + "learning_rate": 1.6690496448098576e-06, + "logits/chosen": -2.2291011810302734, + "logits/rejected": -1.9320964813232422, + "logps/chosen": -229.88858032226562, + "logps/rejected": -217.7265167236328, + "loss": 0.6897, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.008523445576429367, + "rewards/margins": 0.0869704857468605, + "rewards/rejected": -0.09549392759799957, + "step": 9890 + }, + { + "epoch": 0.65, + "learning_rate": 1.6636665405767666e-06, + "logits/chosen": -2.3363430500030518, + "logits/rejected": -2.170015573501587, + "logps/chosen": -235.27737426757812, + "logps/rejected": -223.20523071289062, + "loss": 0.6894, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.02493545040488243, + "rewards/margins": 0.06817851960659027, + "rewards/rejected": -0.043243080377578735, + "step": 9900 + }, + { + "epoch": 0.65, + "eval_logits/chosen": -2.327425956726074, + "eval_logits/rejected": -2.1391117572784424, + "eval_logps/chosen": -232.06900024414062, + "eval_logps/rejected": -220.34877014160156, + "eval_loss": 0.6898226141929626, + "eval_rewards/accuracies": 0.6434999704360962, + "eval_rewards/chosen": -0.0006405520252883434, + "eval_rewards/margins": 0.08672798424959183, + "eval_rewards/rejected": -0.08736853301525116, + "eval_runtime": 713.8035, + "eval_samples_per_second": 2.802, + "eval_steps_per_second": 1.401, + "step": 9900 + }, + { + "epoch": 0.65, + "learning_rate": 1.6582877990686827e-06, + "logits/chosen": -2.330827474594116, + "logits/rejected": -2.275949716567993, + "logps/chosen": -120.868408203125, + "logps/rejected": -154.09080505371094, + "loss": 0.6896, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.004259592853486538, + "rewards/margins": 0.10891057550907135, + "rewards/rejected": -0.10465097427368164, + "step": 9910 + }, + { + "epoch": 0.65, + "learning_rate": 1.6529134483437562e-06, + "logits/chosen": -2.333859920501709, + "logits/rejected": -2.153388738632202, + "logps/chosen": -206.96316528320312, + "logps/rejected": -178.53024291992188, + "loss": 0.6895, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.013889017514884472, + "rewards/margins": 0.11020763963460922, + "rewards/rejected": -0.12409665435552597, + "step": 9920 + }, + { + "epoch": 0.65, + "learning_rate": 1.647543516437233e-06, + "logits/chosen": -2.296292781829834, + "logits/rejected": -2.237565517425537, + "logps/chosen": -202.46505737304688, + "logps/rejected": -229.85952758789062, + "loss": 0.6896, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.034268446266651154, + "rewards/margins": 0.06709616631269455, + "rewards/rejected": -0.1013646125793457, + "step": 9930 + }, + { + "epoch": 0.65, + "learning_rate": 1.6421780313613088e-06, + "logits/chosen": -2.445279121398926, + "logits/rejected": -2.040759563446045, + "logps/chosen": -205.244384765625, + "logps/rejected": -181.84715270996094, + "loss": 0.691, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.005027498118579388, + "rewards/margins": 0.09792643785476685, + "rewards/rejected": -0.10295393317937851, + "step": 9940 + }, + { + "epoch": 0.65, + "learning_rate": 1.6368170211049816e-06, + "logits/chosen": -2.3535354137420654, + "logits/rejected": -1.8584728240966797, + "logps/chosen": -284.0115661621094, + "logps/rejected": -234.0505828857422, + "loss": 0.6883, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.0021137488074600697, + "rewards/margins": 0.10150531679391861, + "rewards/rejected": -0.1036190614104271, + "step": 9950 + }, + { + "epoch": 0.65, + "learning_rate": 1.6314605136339074e-06, + "logits/chosen": -2.3524794578552246, + "logits/rejected": -2.1899707317352295, + "logps/chosen": -198.07577514648438, + "logps/rejected": -187.40274047851562, + "loss": 0.691, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.024890681728720665, + "rewards/margins": 0.07226412743330002, + "rewards/rejected": -0.09715481102466583, + "step": 9960 + }, + { + "epoch": 0.65, + "learning_rate": 1.6261085368902526e-06, + "logits/chosen": -2.551018238067627, + "logits/rejected": -2.202836751937866, + "logps/chosen": -265.8411865234375, + "logps/rejected": -229.21688842773438, + "loss": 0.6891, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.009003100916743279, + "rewards/margins": 0.07005171477794647, + "rewards/rejected": -0.0790548101067543, + "step": 9970 + }, + { + "epoch": 0.65, + "learning_rate": 1.6207611187925503e-06, + "logits/chosen": -2.231933832168579, + "logits/rejected": -2.2483532428741455, + "logps/chosen": -209.96499633789062, + "logps/rejected": -265.6410827636719, + "loss": 0.6867, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.011381099000573158, + "rewards/margins": 0.09358282387256622, + "rewards/rejected": -0.10496392101049423, + "step": 9980 + }, + { + "epoch": 0.65, + "learning_rate": 1.6154182872355512e-06, + "logits/chosen": -2.2878527641296387, + "logits/rejected": -2.338225841522217, + "logps/chosen": -167.45220947265625, + "logps/rejected": -189.28424072265625, + "loss": 0.6919, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.03634749725461006, + "rewards/margins": 0.07412412762641907, + "rewards/rejected": -0.11047162860631943, + "step": 9990 + }, + { + "epoch": 0.65, + "learning_rate": 1.610080070090084e-06, + "logits/chosen": -2.275801181793213, + "logits/rejected": -2.2091450691223145, + "logps/chosen": -186.84176635742188, + "logps/rejected": -186.8584442138672, + "loss": 0.6897, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.059204794466495514, + "rewards/margins": 0.09045000374317169, + "rewards/rejected": -0.149654820561409, + "step": 10000 + }, + { + "epoch": 0.65, + "eval_logits/chosen": -2.322659730911255, + "eval_logits/rejected": -2.1345160007476807, + "eval_logps/chosen": -233.91151428222656, + "eval_logps/rejected": -222.27159118652344, + "eval_loss": 0.6898568272590637, + "eval_rewards/accuracies": 0.6474999785423279, + "eval_rewards/chosen": -0.01906559430062771, + "eval_rewards/margins": 0.08753134310245514, + "eval_rewards/rejected": -0.1065969467163086, + "eval_runtime": 713.7271, + "eval_samples_per_second": 2.802, + "eval_steps_per_second": 1.401, + "step": 10000 + }, + { + "epoch": 0.65, + "learning_rate": 1.6047464952029034e-06, + "logits/chosen": -2.4548022747039795, + "logits/rejected": -2.32185697555542, + "logps/chosen": -254.0906219482422, + "logps/rejected": -272.3085021972656, + "loss": 0.6896, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.001967624295502901, + "rewards/margins": 0.09991808235645294, + "rewards/rejected": -0.09795045852661133, + "step": 10010 + }, + { + "epoch": 0.66, + "learning_rate": 1.5994175903965486e-06, + "logits/chosen": -2.183396577835083, + "logits/rejected": -2.0627362728118896, + "logps/chosen": -254.3908233642578, + "logps/rejected": -266.36431884765625, + "loss": 0.6903, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.04060421884059906, + "rewards/margins": 0.10219021886587143, + "rewards/rejected": -0.1427944153547287, + "step": 10020 + }, + { + "epoch": 0.66, + "learning_rate": 1.5940933834691977e-06, + "logits/chosen": -2.6259872913360596, + "logits/rejected": -1.9777504205703735, + "logps/chosen": -307.5574951171875, + "logps/rejected": -208.10971069335938, + "loss": 0.6905, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.022915838286280632, + "rewards/margins": 0.08449498564004898, + "rewards/rejected": -0.10741082578897476, + "step": 10030 + }, + { + "epoch": 0.66, + "learning_rate": 1.588773902194522e-06, + "logits/chosen": -2.1228013038635254, + "logits/rejected": -1.8389813899993896, + "logps/chosen": -208.42453002929688, + "logps/rejected": -221.58456420898438, + "loss": 0.693, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.04344822093844414, + "rewards/margins": 0.1740039587020874, + "rewards/rejected": -0.21745216846466064, + "step": 10040 + }, + { + "epoch": 0.66, + "learning_rate": 1.583459174321541e-06, + "logits/chosen": -2.1011738777160645, + "logits/rejected": -1.9465796947479248, + "logps/chosen": -213.76220703125, + "logps/rejected": -196.18130493164062, + "loss": 0.6891, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.04144889861345291, + "rewards/margins": 0.09513147920370102, + "rewards/rejected": -0.13658036291599274, + "step": 10050 + }, + { + "epoch": 0.66, + "learning_rate": 1.5781492275744797e-06, + "logits/chosen": -2.5177080631256104, + "logits/rejected": -2.132356882095337, + "logps/chosen": -295.3465881347656, + "logps/rejected": -280.4678039550781, + "loss": 0.6918, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.011021384038031101, + "rewards/margins": 0.11339070647954941, + "rewards/rejected": -0.12441209703683853, + "step": 10060 + }, + { + "epoch": 0.66, + "learning_rate": 1.5728440896526215e-06, + "logits/chosen": -2.2357475757598877, + "logits/rejected": -2.054103374481201, + "logps/chosen": -276.3549499511719, + "logps/rejected": -238.15597534179688, + "loss": 0.6892, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.02952994778752327, + "rewards/margins": 0.08003853261470795, + "rewards/rejected": -0.10956847667694092, + "step": 10070 + }, + { + "epoch": 0.66, + "learning_rate": 1.5675437882301633e-06, + "logits/chosen": -2.3144021034240723, + "logits/rejected": -2.1165995597839355, + "logps/chosen": -223.0723419189453, + "logps/rejected": -191.2317352294922, + "loss": 0.693, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.0402815006673336, + "rewards/margins": 0.02955157496035099, + "rewards/rejected": -0.06983307749032974, + "step": 10080 + }, + { + "epoch": 0.66, + "learning_rate": 1.5622483509560748e-06, + "logits/chosen": -2.194026470184326, + "logits/rejected": -2.2077908515930176, + "logps/chosen": -173.01951599121094, + "logps/rejected": -214.22494506835938, + "loss": 0.6902, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.022903580218553543, + "rewards/margins": 0.11023157835006714, + "rewards/rejected": -0.13313516974449158, + "step": 10090 + }, + { + "epoch": 0.66, + "learning_rate": 1.5569578054539506e-06, + "logits/chosen": -2.266801357269287, + "logits/rejected": -1.928422212600708, + "logps/chosen": -279.8221130371094, + "logps/rejected": -209.8824920654297, + "loss": 0.6859, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.0062502906657755375, + "rewards/margins": 0.13923415541648865, + "rewards/rejected": -0.14548444747924805, + "step": 10100 + }, + { + "epoch": 0.66, + "eval_logits/chosen": -2.3166568279266357, + "eval_logits/rejected": -2.12905216217041, + "eval_logps/chosen": -234.25631713867188, + "eval_logps/rejected": -222.29376220703125, + "eval_loss": 0.689877450466156, + "eval_rewards/accuracies": 0.6474999785423279, + "eval_rewards/chosen": -0.02251364104449749, + "eval_rewards/margins": 0.08430492877960205, + "eval_rewards/rejected": -0.10681857168674469, + "eval_runtime": 714.2219, + "eval_samples_per_second": 2.8, + "eval_steps_per_second": 1.4, + "step": 10100 + }, + { + "epoch": 0.66, + "learning_rate": 1.551672179321867e-06, + "logits/chosen": -2.239980459213257, + "logits/rejected": -2.30826735496521, + "logps/chosen": -217.06527709960938, + "logps/rejected": -203.61270141601562, + "loss": 0.6903, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.012083550915122032, + "rewards/margins": 0.07718555629253387, + "rewards/rejected": -0.08926911652088165, + "step": 10110 + }, + { + "epoch": 0.66, + "learning_rate": 1.5463915001322398e-06, + "logits/chosen": -2.280668258666992, + "logits/rejected": -2.135650157928467, + "logps/chosen": -249.37588500976562, + "logps/rejected": -242.8144073486328, + "loss": 0.6881, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.03719538077712059, + "rewards/margins": 0.07806181162595749, + "rewards/rejected": -0.11525720357894897, + "step": 10120 + }, + { + "epoch": 0.66, + "learning_rate": 1.5411157954316784e-06, + "logits/chosen": -2.2775256633758545, + "logits/rejected": -2.181715250015259, + "logps/chosen": -201.49807739257812, + "logps/rejected": -196.48684692382812, + "loss": 0.6908, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.003794357879087329, + "rewards/margins": 0.08678573369979858, + "rewards/rejected": -0.08299137651920319, + "step": 10130 + }, + { + "epoch": 0.66, + "learning_rate": 1.535845092740843e-06, + "logits/chosen": -2.4420783519744873, + "logits/rejected": -2.282604217529297, + "logps/chosen": -237.28085327148438, + "logps/rejected": -254.278076171875, + "loss": 0.6909, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": 0.00040371305658482015, + "rewards/margins": 0.04601416364312172, + "rewards/rejected": -0.0456104576587677, + "step": 10140 + }, + { + "epoch": 0.66, + "learning_rate": 1.5305794195543005e-06, + "logits/chosen": -2.296903133392334, + "logits/rejected": -2.3559365272521973, + "logps/chosen": -206.9901123046875, + "logps/rejected": -198.97463989257812, + "loss": 0.6891, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.0018900141585618258, + "rewards/margins": 0.08915947377681732, + "rewards/rejected": -0.08726945519447327, + "step": 10150 + }, + { + "epoch": 0.66, + "learning_rate": 1.5253188033403816e-06, + "logits/chosen": -2.321024179458618, + "logits/rejected": -2.416475772857666, + "logps/chosen": -171.10528564453125, + "logps/rejected": -191.79251098632812, + "loss": 0.691, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.0037040214519947767, + "rewards/margins": 0.0402885265648365, + "rewards/rejected": -0.0439925454556942, + "step": 10160 + }, + { + "epoch": 0.67, + "learning_rate": 1.520063271541037e-06, + "logits/chosen": -2.296424150466919, + "logits/rejected": -2.201472282409668, + "logps/chosen": -178.9801483154297, + "logps/rejected": -180.84454345703125, + "loss": 0.6852, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.00475387554615736, + "rewards/margins": 0.1316739171743393, + "rewards/rejected": -0.13642781972885132, + "step": 10170 + }, + { + "epoch": 0.67, + "learning_rate": 1.5148128515716954e-06, + "logits/chosen": -2.529043674468994, + "logits/rejected": -1.8911247253417969, + "logps/chosen": -264.5617370605469, + "logps/rejected": -202.96563720703125, + "loss": 0.6876, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.01967450976371765, + "rewards/margins": 0.10778073221445084, + "rewards/rejected": -0.08810621500015259, + "step": 10180 + }, + { + "epoch": 0.67, + "learning_rate": 1.5095675708211197e-06, + "logits/chosen": -2.348248243331909, + "logits/rejected": -2.340000629425049, + "logps/chosen": -194.94430541992188, + "logps/rejected": -222.9235382080078, + "loss": 0.6904, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.03783398121595383, + "rewards/margins": 0.046356312930583954, + "rewards/rejected": -0.08419029414653778, + "step": 10190 + }, + { + "epoch": 0.67, + "learning_rate": 1.504327456651263e-06, + "logits/chosen": -2.28633189201355, + "logits/rejected": -2.213207244873047, + "logps/chosen": -262.3058166503906, + "logps/rejected": -244.20077514648438, + "loss": 0.6904, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.009028220549225807, + "rewards/margins": 0.08859656751155853, + "rewards/rejected": -0.09762479364871979, + "step": 10200 + }, + { + "epoch": 0.67, + "eval_logits/chosen": -2.3150970935821533, + "eval_logits/rejected": -2.127429723739624, + "eval_logps/chosen": -231.98062133789062, + "eval_logps/rejected": -220.61843872070312, + "eval_loss": 0.6898481249809265, + "eval_rewards/accuracies": 0.6474999785423279, + "eval_rewards/chosen": 0.00024335407942999154, + "eval_rewards/margins": 0.09030859917402267, + "eval_rewards/rejected": -0.09006524831056595, + "eval_runtime": 712.515, + "eval_samples_per_second": 2.807, + "eval_steps_per_second": 1.403, + "step": 10200 + }, + { + "epoch": 0.67, + "learning_rate": 1.4990925363971284e-06, + "logits/chosen": -2.370110034942627, + "logits/rejected": -1.957165002822876, + "logps/chosen": -289.9840087890625, + "logps/rejected": -239.0626983642578, + "loss": 0.6878, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.0006083324551582336, + "rewards/margins": 0.17567750811576843, + "rewards/rejected": -0.17628583312034607, + "step": 10210 + }, + { + "epoch": 0.67, + "learning_rate": 1.4938628373666236e-06, + "logits/chosen": -2.2375118732452393, + "logits/rejected": -2.292834997177124, + "logps/chosen": -172.5015106201172, + "logps/rejected": -181.63931274414062, + "loss": 0.6917, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.0007777426508255303, + "rewards/margins": 0.07265160232782364, + "rewards/rejected": -0.07187385857105255, + "step": 10220 + }, + { + "epoch": 0.67, + "learning_rate": 1.4886383868404203e-06, + "logits/chosen": -2.143146514892578, + "logits/rejected": -2.0093531608581543, + "logps/chosen": -163.1758575439453, + "logps/rejected": -161.16677856445312, + "loss": 0.6891, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.019540909677743912, + "rewards/margins": 0.08020851016044617, + "rewards/rejected": -0.09974941611289978, + "step": 10230 + }, + { + "epoch": 0.67, + "learning_rate": 1.483419212071813e-06, + "logits/chosen": -2.097904920578003, + "logits/rejected": -1.9259834289550781, + "logps/chosen": -188.40487670898438, + "logps/rejected": -189.32362365722656, + "loss": 0.6895, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.000592652999330312, + "rewards/margins": 0.0810011625289917, + "rewards/rejected": -0.08040851354598999, + "step": 10240 + }, + { + "epoch": 0.67, + "learning_rate": 1.478205340286573e-06, + "logits/chosen": -2.2346534729003906, + "logits/rejected": -2.2042670249938965, + "logps/chosen": -209.13229370117188, + "logps/rejected": -199.13385009765625, + "loss": 0.6894, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.05523378401994705, + "rewards/margins": 0.08006395399570465, + "rewards/rejected": -0.1352977454662323, + "step": 10250 + }, + { + "epoch": 0.67, + "learning_rate": 1.4729967986828104e-06, + "logits/chosen": -2.3945605754852295, + "logits/rejected": -2.156639575958252, + "logps/chosen": -321.37115478515625, + "logps/rejected": -282.4629211425781, + "loss": 0.69, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.006125994957983494, + "rewards/margins": 0.0905977264046669, + "rewards/rejected": -0.08447173237800598, + "step": 10260 + }, + { + "epoch": 0.67, + "learning_rate": 1.4677936144308286e-06, + "logits/chosen": -2.3705971240997314, + "logits/rejected": -2.0719449520111084, + "logps/chosen": -222.4903564453125, + "logps/rejected": -207.2783966064453, + "loss": 0.6883, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.017543844878673553, + "rewards/margins": 0.12851087749004364, + "rewards/rejected": -0.11096702516078949, + "step": 10270 + }, + { + "epoch": 0.67, + "learning_rate": 1.4625958146729864e-06, + "logits/chosen": -2.386382818222046, + "logits/rejected": -2.2117514610290527, + "logps/chosen": -220.2921142578125, + "logps/rejected": -212.52377319335938, + "loss": 0.6898, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.00028302668943069875, + "rewards/margins": 0.08179637044668198, + "rewards/rejected": -0.08207939565181732, + "step": 10280 + }, + { + "epoch": 0.67, + "learning_rate": 1.4574034265235523e-06, + "logits/chosen": -2.474156141281128, + "logits/rejected": -1.9117343425750732, + "logps/chosen": -254.64987182617188, + "logps/rejected": -168.7136688232422, + "loss": 0.6904, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.011786472983658314, + "rewards/margins": 0.11826670169830322, + "rewards/rejected": -0.10648022592067719, + "step": 10290 + }, + { + "epoch": 0.67, + "learning_rate": 1.452216477068568e-06, + "logits/chosen": -2.342738389968872, + "logits/rejected": -1.8315894603729248, + "logps/chosen": -229.30612182617188, + "logps/rejected": -147.89767456054688, + "loss": 0.6876, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.007388654164969921, + "rewards/margins": 0.11506316810846329, + "rewards/rejected": -0.10767451673746109, + "step": 10300 + }, + { + "epoch": 0.67, + "eval_logits/chosen": -2.3180654048919678, + "eval_logits/rejected": -2.130103826522827, + "eval_logps/chosen": -231.863525390625, + "eval_logps/rejected": -219.8981475830078, + "eval_loss": 0.6898233294487, + "eval_rewards/accuracies": 0.6434999704360962, + "eval_rewards/chosen": 0.0014140387065708637, + "eval_rewards/margins": 0.08427631109952927, + "eval_rewards/rejected": -0.08286228775978088, + "eval_runtime": 713.5886, + "eval_samples_per_second": 2.803, + "eval_steps_per_second": 1.401, + "step": 10300 + }, + { + "epoch": 0.67, + "learning_rate": 1.4470349933657004e-06, + "logits/chosen": -2.5185744762420654, + "logits/rejected": -2.2852203845977783, + "logps/chosen": -220.93069458007812, + "logps/rejected": -204.84841918945312, + "loss": 0.6902, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.002010664436966181, + "rewards/margins": 0.07896588742733002, + "rewards/rejected": -0.08097656071186066, + "step": 10310 + }, + { + "epoch": 0.68, + "learning_rate": 1.4418590024441096e-06, + "logits/chosen": -2.41025972366333, + "logits/rejected": -2.01489520072937, + "logps/chosen": -250.54312133789062, + "logps/rejected": -192.83309936523438, + "loss": 0.6889, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.029171252623200417, + "rewards/margins": 0.08775301277637482, + "rewards/rejected": -0.05858175829052925, + "step": 10320 + }, + { + "epoch": 0.68, + "learning_rate": 1.436688531304297e-06, + "logits/chosen": -2.404268264770508, + "logits/rejected": -2.0696969032287598, + "logps/chosen": -216.67111206054688, + "logps/rejected": -219.65896606445312, + "loss": 0.6889, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.01656418666243553, + "rewards/margins": 0.08477049320936203, + "rewards/rejected": -0.0682063102722168, + "step": 10330 + }, + { + "epoch": 0.68, + "learning_rate": 1.431523606917974e-06, + "logits/chosen": -2.224674940109253, + "logits/rejected": -2.2193877696990967, + "logps/chosen": -207.0742950439453, + "logps/rejected": -223.9700164794922, + "loss": 0.6865, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.019749853760004044, + "rewards/margins": 0.10281027853488922, + "rewards/rejected": -0.12256012111902237, + "step": 10340 + }, + { + "epoch": 0.68, + "learning_rate": 1.4263642562279162e-06, + "logits/chosen": -2.0139873027801514, + "logits/rejected": -1.9893707036972046, + "logps/chosen": -249.74459838867188, + "logps/rejected": -266.02679443359375, + "loss": 0.6899, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.002572817262262106, + "rewards/margins": 0.07792092114686966, + "rewards/rejected": -0.0804937332868576, + "step": 10350 + }, + { + "epoch": 0.68, + "learning_rate": 1.4212105061478257e-06, + "logits/chosen": -2.0695881843566895, + "logits/rejected": -2.087667465209961, + "logps/chosen": -229.23046875, + "logps/rejected": -247.36196899414062, + "loss": 0.6886, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.006600628606975079, + "rewards/margins": 0.10308027267456055, + "rewards/rejected": -0.09647965431213379, + "step": 10360 + }, + { + "epoch": 0.68, + "learning_rate": 1.4160623835621848e-06, + "logits/chosen": -2.4251503944396973, + "logits/rejected": -2.291647434234619, + "logps/chosen": -235.0546875, + "logps/rejected": -231.79080200195312, + "loss": 0.6901, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.022564858198165894, + "rewards/margins": 0.08214254677295685, + "rewards/rejected": -0.05957768112421036, + "step": 10370 + }, + { + "epoch": 0.68, + "learning_rate": 1.4109199153261249e-06, + "logits/chosen": -2.2098567485809326, + "logits/rejected": -2.1036949157714844, + "logps/chosen": -273.9472961425781, + "logps/rejected": -253.73007202148438, + "loss": 0.6891, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.02047353982925415, + "rewards/margins": 0.1009642630815506, + "rewards/rejected": -0.08049070835113525, + "step": 10380 + }, + { + "epoch": 0.68, + "learning_rate": 1.405783128265278e-06, + "logits/chosen": -2.2883942127227783, + "logits/rejected": -2.265260934829712, + "logps/chosen": -204.190185546875, + "logps/rejected": -207.59140014648438, + "loss": 0.6898, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.0005667827790603042, + "rewards/margins": 0.06667406111955643, + "rewards/rejected": -0.06724084168672562, + "step": 10390 + }, + { + "epoch": 0.68, + "learning_rate": 1.4006520491756427e-06, + "logits/chosen": -2.389329433441162, + "logits/rejected": -2.1550679206848145, + "logps/chosen": -194.2144012451172, + "logps/rejected": -140.29342651367188, + "loss": 0.6888, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.020098352804780006, + "rewards/margins": 0.10719966888427734, + "rewards/rejected": -0.08710131794214249, + "step": 10400 + }, + { + "epoch": 0.68, + "eval_logits/chosen": -2.3169994354248047, + "eval_logits/rejected": -2.129032611846924, + "eval_logps/chosen": -230.2224578857422, + "eval_logps/rejected": -218.50982666015625, + "eval_loss": 0.6898374557495117, + "eval_rewards/accuracies": 0.6384999752044678, + "eval_rewards/chosen": 0.017825065180659294, + "eval_rewards/margins": 0.08680439740419388, + "eval_rewards/rejected": -0.06897933036088943, + "eval_runtime": 714.2722, + "eval_samples_per_second": 2.8, + "eval_steps_per_second": 1.4, + "step": 10400 + }, + { + "epoch": 0.68, + "learning_rate": 1.39552670482344e-06, + "logits/chosen": -2.207252264022827, + "logits/rejected": -2.280210018157959, + "logps/chosen": -173.74078369140625, + "logps/rejected": -181.15975952148438, + "loss": 0.6907, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.00858780462294817, + "rewards/margins": 0.07527503371238708, + "rewards/rejected": -0.06668722629547119, + "step": 10410 + }, + { + "epoch": 0.68, + "learning_rate": 1.3904071219449776e-06, + "logits/chosen": -2.2979884147644043, + "logits/rejected": -1.8623745441436768, + "logps/chosen": -196.11106872558594, + "logps/rejected": -119.69913482666016, + "loss": 0.6895, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.037905577570199966, + "rewards/margins": 0.08248183131217957, + "rewards/rejected": -0.0445762574672699, + "step": 10420 + }, + { + "epoch": 0.68, + "learning_rate": 1.3852933272465068e-06, + "logits/chosen": -2.458289384841919, + "logits/rejected": -2.22627592086792, + "logps/chosen": -234.090087890625, + "logps/rejected": -206.09585571289062, + "loss": 0.6914, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.03770025074481964, + "rewards/margins": 0.07229628413915634, + "rewards/rejected": -0.0345960296690464, + "step": 10430 + }, + { + "epoch": 0.68, + "learning_rate": 1.3801853474040873e-06, + "logits/chosen": -2.254831552505493, + "logits/rejected": -2.187377452850342, + "logps/chosen": -236.34634399414062, + "logps/rejected": -230.05392456054688, + "loss": 0.6891, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.021748732775449753, + "rewards/margins": 0.10167870670557022, + "rewards/rejected": -0.07992996275424957, + "step": 10440 + }, + { + "epoch": 0.68, + "learning_rate": 1.3750832090634417e-06, + "logits/chosen": -2.4042470455169678, + "logits/rejected": -2.09578537940979, + "logps/chosen": -181.5123291015625, + "logps/rejected": -175.58468627929688, + "loss": 0.6892, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.04660337418317795, + "rewards/margins": 0.08723708242177963, + "rewards/rejected": -0.040633708238601685, + "step": 10450 + }, + { + "epoch": 0.68, + "learning_rate": 1.3699869388398245e-06, + "logits/chosen": -2.2707431316375732, + "logits/rejected": -2.1249325275421143, + "logps/chosen": -209.69296264648438, + "logps/rejected": -199.55191040039062, + "loss": 0.6904, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.03328350931406021, + "rewards/margins": 0.08312380313873291, + "rewards/rejected": -0.049840297549963, + "step": 10460 + }, + { + "epoch": 0.69, + "learning_rate": 1.3648965633178772e-06, + "logits/chosen": -2.2864489555358887, + "logits/rejected": -2.1960525512695312, + "logps/chosen": -197.67889404296875, + "logps/rejected": -219.58993530273438, + "loss": 0.6911, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.030532773584127426, + "rewards/margins": 0.10499455034732819, + "rewards/rejected": -0.07446177303791046, + "step": 10470 + }, + { + "epoch": 0.69, + "learning_rate": 1.3598121090514938e-06, + "logits/chosen": -2.275895357131958, + "logits/rejected": -2.1503143310546875, + "logps/chosen": -180.6181640625, + "logps/rejected": -165.47055053710938, + "loss": 0.6887, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.02389690652489662, + "rewards/margins": 0.08975638449192047, + "rewards/rejected": -0.06585947424173355, + "step": 10480 + }, + { + "epoch": 0.69, + "learning_rate": 1.3547336025636753e-06, + "logits/chosen": -2.2028050422668457, + "logits/rejected": -1.984891653060913, + "logps/chosen": -273.01007080078125, + "logps/rejected": -239.7392120361328, + "loss": 0.6903, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.022864457219839096, + "rewards/margins": 0.08258132636547089, + "rewards/rejected": -0.059716880321502686, + "step": 10490 + }, + { + "epoch": 0.69, + "learning_rate": 1.3496610703464022e-06, + "logits/chosen": -2.351762533187866, + "logits/rejected": -2.1159586906433105, + "logps/chosen": -227.1230926513672, + "logps/rejected": -190.7210693359375, + "loss": 0.6893, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.020914912223815918, + "rewards/margins": 0.09119327366352081, + "rewards/rejected": -0.0702783390879631, + "step": 10500 + }, + { + "epoch": 0.69, + "eval_logits/chosen": -2.320483684539795, + "eval_logits/rejected": -2.132249593734741, + "eval_logps/chosen": -229.9177703857422, + "eval_logps/rejected": -217.9020538330078, + "eval_loss": 0.6898301839828491, + "eval_rewards/accuracies": 0.6395000219345093, + "eval_rewards/chosen": 0.0208718404173851, + "eval_rewards/margins": 0.08377327024936676, + "eval_rewards/rejected": -0.06290142238140106, + "eval_runtime": 713.7053, + "eval_samples_per_second": 2.802, + "eval_steps_per_second": 1.401, + "step": 10500 + }, + { + "epoch": 0.69, + "learning_rate": 1.3445945388604848e-06, + "logits/chosen": -2.213752508163452, + "logits/rejected": -1.9681813716888428, + "logps/chosen": -239.2418670654297, + "logps/rejected": -207.7646942138672, + "loss": 0.6897, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.010183857753872871, + "rewards/margins": 0.1168348640203476, + "rewards/rejected": -0.12701871991157532, + "step": 10510 + }, + { + "epoch": 0.69, + "learning_rate": 1.3395340345354358e-06, + "logits/chosen": -2.238507032394409, + "logits/rejected": -2.3673980236053467, + "logps/chosen": -223.3243408203125, + "logps/rejected": -249.09719848632812, + "loss": 0.6879, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.003734863130375743, + "rewards/margins": 0.08525262027978897, + "rewards/rejected": -0.08151774853467941, + "step": 10520 + }, + { + "epoch": 0.69, + "learning_rate": 1.334479583769322e-06, + "logits/chosen": -2.470689296722412, + "logits/rejected": -2.1623778343200684, + "logps/chosen": -252.4402313232422, + "logps/rejected": -207.7847137451172, + "loss": 0.6912, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.023985490202903748, + "rewards/margins": 0.05314163491129875, + "rewards/rejected": -0.029156142845749855, + "step": 10530 + }, + { + "epoch": 0.69, + "learning_rate": 1.3294312129286366e-06, + "logits/chosen": -2.312894344329834, + "logits/rejected": -2.193312168121338, + "logps/chosen": -263.58502197265625, + "logps/rejected": -255.36056518554688, + "loss": 0.691, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.031435929238796234, + "rewards/margins": 0.05446425825357437, + "rewards/rejected": -0.023028332740068436, + "step": 10540 + }, + { + "epoch": 0.69, + "learning_rate": 1.324388948348153e-06, + "logits/chosen": -2.469510078430176, + "logits/rejected": -2.076355218887329, + "logps/chosen": -283.5762023925781, + "logps/rejected": -210.35165405273438, + "loss": 0.6885, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.016617698594927788, + "rewards/margins": 0.0814189538359642, + "rewards/rejected": -0.06480126827955246, + "step": 10550 + }, + { + "epoch": 0.69, + "learning_rate": 1.319352816330796e-06, + "logits/chosen": -2.5570406913757324, + "logits/rejected": -2.0373148918151855, + "logps/chosen": -281.7043151855469, + "logps/rejected": -184.83396911621094, + "loss": 0.6904, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.014928947202861309, + "rewards/margins": 0.09585042297840118, + "rewards/rejected": -0.0809214860200882, + "step": 10560 + }, + { + "epoch": 0.69, + "learning_rate": 1.314322843147494e-06, + "logits/chosen": -2.155149221420288, + "logits/rejected": -2.2533445358276367, + "logps/chosen": -172.06494140625, + "logps/rejected": -234.2112579345703, + "loss": 0.6899, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.017391042783856392, + "rewards/margins": 0.07278571277856827, + "rewards/rejected": -0.09017674624919891, + "step": 10570 + }, + { + "epoch": 0.69, + "learning_rate": 1.3092990550370526e-06, + "logits/chosen": -2.4301834106445312, + "logits/rejected": -2.0875182151794434, + "logps/chosen": -345.9046325683594, + "logps/rejected": -267.7071228027344, + "loss": 0.6897, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.0014622181188315153, + "rewards/margins": 0.07153777033090591, + "rewards/rejected": -0.0729999840259552, + "step": 10580 + }, + { + "epoch": 0.69, + "learning_rate": 1.3042814782060131e-06, + "logits/chosen": -2.3605728149414062, + "logits/rejected": -2.063471794128418, + "logps/chosen": -177.5745849609375, + "logps/rejected": -164.29254150390625, + "loss": 0.6882, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.03928259015083313, + "rewards/margins": 0.10634209215641022, + "rewards/rejected": -0.06705950945615768, + "step": 10590 + }, + { + "epoch": 0.69, + "learning_rate": 1.2992701388285112e-06, + "logits/chosen": -2.371594190597534, + "logits/rejected": -2.116765260696411, + "logps/chosen": -266.95880126953125, + "logps/rejected": -229.24685668945312, + "loss": 0.6893, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.02390308678150177, + "rewards/margins": 0.06760050356388092, + "rewards/rejected": -0.04369742050766945, + "step": 10600 + }, + { + "epoch": 0.69, + "eval_logits/chosen": -2.317068099975586, + "eval_logits/rejected": -2.1291825771331787, + "eval_logps/chosen": -230.43104553222656, + "eval_logps/rejected": -218.47349548339844, + "eval_loss": 0.689825177192688, + "eval_rewards/accuracies": 0.6430000066757202, + "eval_rewards/chosen": 0.015739070251584053, + "eval_rewards/margins": 0.08435513079166412, + "eval_rewards/rejected": -0.06861607730388641, + "eval_runtime": 714.2332, + "eval_samples_per_second": 2.8, + "eval_steps_per_second": 1.4, + "step": 10600 + }, + { + "epoch": 0.69, + "learning_rate": 1.29426506304615e-06, + "logits/chosen": -2.2151741981506348, + "logits/rejected": -2.148871898651123, + "logps/chosen": -225.4136199951172, + "logps/rejected": -209.6869354248047, + "loss": 0.6934, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.0269019715487957, + "rewards/margins": 0.03751341626048088, + "rewards/rejected": -0.06441538780927658, + "step": 10610 + }, + { + "epoch": 0.69, + "learning_rate": 1.289266276967855e-06, + "logits/chosen": -2.364128589630127, + "logits/rejected": -2.249577760696411, + "logps/chosen": -337.41046142578125, + "logps/rejected": -254.139892578125, + "loss": 0.6915, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.011713572777807713, + "rewards/margins": 0.06887595355510712, + "rewards/rejected": -0.05716238543391228, + "step": 10620 + }, + { + "epoch": 0.7, + "learning_rate": 1.284273806669745e-06, + "logits/chosen": -2.3198752403259277, + "logits/rejected": -2.095829963684082, + "logps/chosen": -241.2201690673828, + "logps/rejected": -268.35296630859375, + "loss": 0.6886, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.018951773643493652, + "rewards/margins": 0.09592192620038986, + "rewards/rejected": -0.1148737221956253, + "step": 10630 + }, + { + "epoch": 0.7, + "learning_rate": 1.2792876781949884e-06, + "logits/chosen": -2.0602927207946777, + "logits/rejected": -1.8088810443878174, + "logps/chosen": -207.2493133544922, + "logps/rejected": -195.7181396484375, + "loss": 0.6884, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.011766849085688591, + "rewards/margins": 0.07786474376916885, + "rewards/rejected": -0.08963160216808319, + "step": 10640 + }, + { + "epoch": 0.7, + "learning_rate": 1.274307917553676e-06, + "logits/chosen": -2.3178839683532715, + "logits/rejected": -2.2395923137664795, + "logps/chosen": -196.51788330078125, + "logps/rejected": -232.0148162841797, + "loss": 0.6905, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.008591088466346264, + "rewards/margins": 0.10897374153137207, + "rewards/rejected": -0.11756483465433121, + "step": 10650 + }, + { + "epoch": 0.7, + "learning_rate": 1.2693345507226767e-06, + "logits/chosen": -2.1067943572998047, + "logits/rejected": -2.191366672515869, + "logps/chosen": -230.4332733154297, + "logps/rejected": -244.55105590820312, + "loss": 0.6875, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.004430090077221394, + "rewards/margins": 0.12004270404577255, + "rewards/rejected": -0.12447279691696167, + "step": 10660 + }, + { + "epoch": 0.7, + "learning_rate": 1.2643676036455099e-06, + "logits/chosen": -2.3527565002441406, + "logits/rejected": -2.2609760761260986, + "logps/chosen": -288.02587890625, + "logps/rejected": -241.6444854736328, + "loss": 0.6923, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.0017340302001684904, + "rewards/margins": 0.043596215546131134, + "rewards/rejected": -0.045330245047807693, + "step": 10670 + }, + { + "epoch": 0.7, + "learning_rate": 1.259407102232203e-06, + "logits/chosen": -2.452852725982666, + "logits/rejected": -2.0722763538360596, + "logps/chosen": -277.25531005859375, + "logps/rejected": -212.7763671875, + "loss": 0.6883, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0037903212942183018, + "rewards/margins": 0.08465041220188141, + "rewards/rejected": -0.08086008578538895, + "step": 10680 + }, + { + "epoch": 0.7, + "learning_rate": 1.254453072359163e-06, + "logits/chosen": -2.389622211456299, + "logits/rejected": -2.1636927127838135, + "logps/chosen": -232.2378387451172, + "logps/rejected": -217.2246551513672, + "loss": 0.6892, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.017822107300162315, + "rewards/margins": 0.06853047758340836, + "rewards/rejected": -0.05070837587118149, + "step": 10690 + }, + { + "epoch": 0.7, + "learning_rate": 1.2495055398690337e-06, + "logits/chosen": -2.4718260765075684, + "logits/rejected": -2.228008508682251, + "logps/chosen": -228.7391357421875, + "logps/rejected": -225.41854858398438, + "loss": 0.6907, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.011152736842632294, + "rewards/margins": 0.05962613224983215, + "rewards/rejected": -0.04847339540719986, + "step": 10700 + }, + { + "epoch": 0.7, + "eval_logits/chosen": -2.3170278072357178, + "eval_logits/rejected": -2.1292710304260254, + "eval_logps/chosen": -230.355224609375, + "eval_logps/rejected": -218.42803955078125, + "eval_loss": 0.6898084878921509, + "eval_rewards/accuracies": 0.6430000066757202, + "eval_rewards/chosen": 0.016497209668159485, + "eval_rewards/margins": 0.08465855568647385, + "eval_rewards/rejected": -0.06816134601831436, + "eval_runtime": 714.9419, + "eval_samples_per_second": 2.797, + "eval_steps_per_second": 1.399, + "step": 10700 + }, + { + "epoch": 0.7, + "learning_rate": 1.2445645305705718e-06, + "logits/chosen": -2.4910426139831543, + "logits/rejected": -2.1560773849487305, + "logps/chosen": -206.70126342773438, + "logps/rejected": -192.34877014160156, + "loss": 0.6913, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.0038032480515539646, + "rewards/margins": 0.06290373206138611, + "rewards/rejected": -0.06670697778463364, + "step": 10710 + }, + { + "epoch": 0.7, + "learning_rate": 1.2396300702384995e-06, + "logits/chosen": -2.4755501747131348, + "logits/rejected": -2.2301669120788574, + "logps/chosen": -256.727294921875, + "logps/rejected": -224.15756225585938, + "loss": 0.6921, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.010807778686285019, + "rewards/margins": 0.05095580965280533, + "rewards/rejected": -0.04014802724123001, + "step": 10720 + }, + { + "epoch": 0.7, + "learning_rate": 1.234702184613381e-06, + "logits/chosen": -2.0976624488830566, + "logits/rejected": -2.176741600036621, + "logps/chosen": -212.48825073242188, + "logps/rejected": -221.2858428955078, + "loss": 0.6904, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.029940495267510414, + "rewards/margins": 0.09133056551218033, + "rewards/rejected": -0.06139007955789566, + "step": 10730 + }, + { + "epoch": 0.7, + "learning_rate": 1.2297808994014793e-06, + "logits/chosen": -2.411856174468994, + "logits/rejected": -2.16345477104187, + "logps/chosen": -282.15887451171875, + "logps/rejected": -253.8297882080078, + "loss": 0.6902, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.03775983303785324, + "rewards/margins": 0.06563454121351242, + "rewards/rejected": -0.02787470817565918, + "step": 10740 + }, + { + "epoch": 0.7, + "learning_rate": 1.2248662402746314e-06, + "logits/chosen": -2.251694679260254, + "logits/rejected": -2.121366500854492, + "logps/chosen": -191.34400939941406, + "logps/rejected": -191.785400390625, + "loss": 0.6914, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.014273548498749733, + "rewards/margins": 0.0655721127986908, + "rewards/rejected": -0.07984566688537598, + "step": 10750 + }, + { + "epoch": 0.7, + "learning_rate": 1.2199582328701045e-06, + "logits/chosen": -2.414492607116699, + "logits/rejected": -1.9229921102523804, + "logps/chosen": -289.251708984375, + "logps/rejected": -257.48602294921875, + "loss": 0.688, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.013818124309182167, + "rewards/margins": 0.08525200188159943, + "rewards/rejected": -0.07143385708332062, + "step": 10760 + }, + { + "epoch": 0.7, + "learning_rate": 1.2150569027904712e-06, + "logits/chosen": -2.3025307655334473, + "logits/rejected": -2.191779851913452, + "logps/chosen": -243.48434448242188, + "logps/rejected": -245.3304901123047, + "loss": 0.6899, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.024991249665617943, + "rewards/margins": 0.08149056136608124, + "rewards/rejected": -0.056499313563108444, + "step": 10770 + }, + { + "epoch": 0.71, + "learning_rate": 1.2101622756034688e-06, + "logits/chosen": -2.2971510887145996, + "logits/rejected": -2.281870126724243, + "logps/chosen": -221.53494262695312, + "logps/rejected": -199.281982421875, + "loss": 0.6893, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.012190895155072212, + "rewards/margins": 0.08112286031246185, + "rewards/rejected": -0.0689319595694542, + "step": 10780 + }, + { + "epoch": 0.71, + "learning_rate": 1.2052743768418715e-06, + "logits/chosen": -2.353773832321167, + "logits/rejected": -2.1345012187957764, + "logps/chosen": -243.76358032226562, + "logps/rejected": -218.4962158203125, + "loss": 0.6889, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.04045509174466133, + "rewards/margins": 0.08739558607339859, + "rewards/rejected": -0.046940483152866364, + "step": 10790 + }, + { + "epoch": 0.71, + "learning_rate": 1.2003932320033523e-06, + "logits/chosen": -2.463850498199463, + "logits/rejected": -2.155269145965576, + "logps/chosen": -223.14895629882812, + "logps/rejected": -234.60220336914062, + "loss": 0.6877, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.033537622541189194, + "rewards/margins": 0.10821393877267838, + "rewards/rejected": -0.07467631250619888, + "step": 10800 + }, + { + "epoch": 0.71, + "eval_logits/chosen": -2.3170695304870605, + "eval_logits/rejected": -2.129291534423828, + "eval_logps/chosen": -229.36056518554688, + "eval_logps/rejected": -217.1490020751953, + "eval_loss": 0.6898130178451538, + "eval_rewards/accuracies": 0.6434999704360962, + "eval_rewards/chosen": 0.026443878188729286, + "eval_rewards/margins": 0.08181502670049667, + "eval_rewards/rejected": -0.05537115037441254, + "eval_runtime": 714.9768, + "eval_samples_per_second": 2.797, + "eval_steps_per_second": 1.399, + "step": 10800 + }, + { + "epoch": 0.71, + "learning_rate": 1.1955188665503553e-06, + "logits/chosen": -2.1240930557250977, + "logits/rejected": -2.1801092624664307, + "logps/chosen": -205.2183074951172, + "logps/rejected": -190.25765991210938, + "loss": 0.6927, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0009462200105190277, + "rewards/margins": 0.0582948699593544, + "rewards/rejected": -0.05924109369516373, + "step": 10810 + }, + { + "epoch": 0.71, + "learning_rate": 1.1906513059099566e-06, + "logits/chosen": -2.3073532581329346, + "logits/rejected": -1.9913088083267212, + "logps/chosen": -227.1727294921875, + "logps/rejected": -229.9739532470703, + "loss": 0.6884, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.04316357523202896, + "rewards/margins": 0.11541776359081268, + "rewards/rejected": -0.07225419580936432, + "step": 10820 + }, + { + "epoch": 0.71, + "learning_rate": 1.185790575473738e-06, + "logits/chosen": -2.247575283050537, + "logits/rejected": -2.0391554832458496, + "logps/chosen": -227.498046875, + "logps/rejected": -188.4443817138672, + "loss": 0.6911, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.01635267771780491, + "rewards/margins": 0.08220528066158295, + "rewards/rejected": -0.06585261225700378, + "step": 10830 + }, + { + "epoch": 0.71, + "learning_rate": 1.1809367005976516e-06, + "logits/chosen": -2.3067519664764404, + "logits/rejected": -2.062891721725464, + "logps/chosen": -280.78778076171875, + "logps/rejected": -215.189697265625, + "loss": 0.6912, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.017070407047867775, + "rewards/margins": 0.05449339747428894, + "rewards/rejected": -0.037422992289066315, + "step": 10840 + }, + { + "epoch": 0.71, + "learning_rate": 1.1760897066018842e-06, + "logits/chosen": -2.242995262145996, + "logits/rejected": -2.1077020168304443, + "logps/chosen": -219.93295288085938, + "logps/rejected": -224.17984008789062, + "loss": 0.6896, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.031081268563866615, + "rewards/margins": 0.10243818908929825, + "rewards/rejected": -0.07135690748691559, + "step": 10850 + }, + { + "epoch": 0.71, + "learning_rate": 1.1712496187707327e-06, + "logits/chosen": -2.275620698928833, + "logits/rejected": -2.0290729999542236, + "logps/chosen": -240.3934783935547, + "logps/rejected": -239.35073852539062, + "loss": 0.6897, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.000892448821105063, + "rewards/margins": 0.13023024797439575, + "rewards/rejected": -0.13112269341945648, + "step": 10860 + }, + { + "epoch": 0.71, + "learning_rate": 1.1664164623524646e-06, + "logits/chosen": -2.244417667388916, + "logits/rejected": -2.084033727645874, + "logps/chosen": -216.13034057617188, + "logps/rejected": -195.35897827148438, + "loss": 0.6901, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.022446701303124428, + "rewards/margins": 0.0748591274023056, + "rewards/rejected": -0.052412427961826324, + "step": 10870 + }, + { + "epoch": 0.71, + "learning_rate": 1.1615902625591926e-06, + "logits/chosen": -2.2117581367492676, + "logits/rejected": -2.1228392124176025, + "logps/chosen": -227.09072875976562, + "logps/rejected": -223.296630859375, + "loss": 0.6905, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.01869513839483261, + "rewards/margins": 0.056195490062236786, + "rewards/rejected": -0.0748906284570694, + "step": 10880 + }, + { + "epoch": 0.71, + "learning_rate": 1.156771044566738e-06, + "logits/chosen": -2.3004848957061768, + "logits/rejected": -2.2247865200042725, + "logps/chosen": -264.3690490722656, + "logps/rejected": -228.86154174804688, + "loss": 0.6902, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.0024559935554862022, + "rewards/margins": 0.0770992636680603, + "rewards/rejected": -0.07464326918125153, + "step": 10890 + }, + { + "epoch": 0.71, + "learning_rate": 1.1519588335145037e-06, + "logits/chosen": -2.2524847984313965, + "logits/rejected": -2.393233060836792, + "logps/chosen": -205.72433471679688, + "logps/rejected": -228.1226806640625, + "loss": 0.6924, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.018544599413871765, + "rewards/margins": 0.036989279091358185, + "rewards/rejected": -0.01844467595219612, + "step": 10900 + }, + { + "epoch": 0.71, + "eval_logits/chosen": -2.3110759258270264, + "eval_logits/rejected": -2.123832941055298, + "eval_logps/chosen": -230.8058624267578, + "eval_logps/rejected": -218.31472778320312, + "eval_loss": 0.6898163557052612, + "eval_rewards/accuracies": 0.6384999752044678, + "eval_rewards/chosen": 0.011990930885076523, + "eval_rewards/margins": 0.07901943475008011, + "eval_rewards/rejected": -0.06702849268913269, + "eval_runtime": 712.284, + "eval_samples_per_second": 2.808, + "eval_steps_per_second": 1.404, + "step": 10900 + }, + { + "epoch": 0.71, + "learning_rate": 1.1471536545053382e-06, + "logits/chosen": -2.343756914138794, + "logits/rejected": -2.3280563354492188, + "logps/chosen": -206.11807250976562, + "logps/rejected": -222.1238250732422, + "loss": 0.6895, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.02916853502392769, + "rewards/margins": 0.0845358818769455, + "rewards/rejected": -0.05536733940243721, + "step": 10910 + }, + { + "epoch": 0.71, + "learning_rate": 1.1423555326054112e-06, + "logits/chosen": -2.2487311363220215, + "logits/rejected": -1.972808837890625, + "logps/chosen": -279.33294677734375, + "logps/rejected": -221.55191040039062, + "loss": 0.6837, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.035016145557165146, + "rewards/margins": 0.1565500795841217, + "rewards/rejected": -0.12153393030166626, + "step": 10920 + }, + { + "epoch": 0.72, + "learning_rate": 1.1375644928440743e-06, + "logits/chosen": -2.3918063640594482, + "logits/rejected": -1.9648542404174805, + "logps/chosen": -234.3190155029297, + "logps/rejected": -176.64840698242188, + "loss": 0.6883, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.023642176762223244, + "rewards/margins": 0.09833236038684845, + "rewards/rejected": -0.07469018548727036, + "step": 10930 + }, + { + "epoch": 0.72, + "learning_rate": 1.1327805602137396e-06, + "logits/chosen": -2.329948663711548, + "logits/rejected": -2.149320602416992, + "logps/chosen": -258.65191650390625, + "logps/rejected": -214.11959838867188, + "loss": 0.6894, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.0009221710497513413, + "rewards/margins": 0.08445750176906586, + "rewards/rejected": -0.0835353285074234, + "step": 10940 + }, + { + "epoch": 0.72, + "learning_rate": 1.1280037596697426e-06, + "logits/chosen": -2.181227922439575, + "logits/rejected": -2.1459288597106934, + "logps/chosen": -220.15103149414062, + "logps/rejected": -270.27410888671875, + "loss": 0.6846, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.007314229849725962, + "rewards/margins": 0.13196273148059845, + "rewards/rejected": -0.12464849650859833, + "step": 10950 + }, + { + "epoch": 0.72, + "learning_rate": 1.123234116130216e-06, + "logits/chosen": -2.2392992973327637, + "logits/rejected": -2.20440673828125, + "logps/chosen": -185.53500366210938, + "logps/rejected": -203.2857208251953, + "loss": 0.6889, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.0035335198044776917, + "rewards/margins": 0.11696387827396393, + "rewards/rejected": -0.11343035846948624, + "step": 10960 + }, + { + "epoch": 0.72, + "learning_rate": 1.1184716544759553e-06, + "logits/chosen": -2.0971415042877197, + "logits/rejected": -2.0736968517303467, + "logps/chosen": -163.72232055664062, + "logps/rejected": -181.16928100585938, + "loss": 0.6905, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.01641870103776455, + "rewards/margins": 0.05032380297780037, + "rewards/rejected": -0.03390509635210037, + "step": 10970 + }, + { + "epoch": 0.72, + "learning_rate": 1.1137163995502948e-06, + "logits/chosen": -2.5290920734405518, + "logits/rejected": -2.2951109409332275, + "logps/chosen": -218.7498779296875, + "logps/rejected": -193.7891387939453, + "loss": 0.6892, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.02785148099064827, + "rewards/margins": 0.07741155475378036, + "rewards/rejected": -0.0495600700378418, + "step": 10980 + }, + { + "epoch": 0.72, + "learning_rate": 1.1089683761589717e-06, + "logits/chosen": -2.1559228897094727, + "logits/rejected": -1.998282790184021, + "logps/chosen": -235.5149383544922, + "logps/rejected": -228.6207275390625, + "loss": 0.6884, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.04691413417458534, + "rewards/margins": 0.12020029127597809, + "rewards/rejected": -0.07328616082668304, + "step": 10990 + }, + { + "epoch": 0.72, + "learning_rate": 1.1042276090700044e-06, + "logits/chosen": -2.3137733936309814, + "logits/rejected": -2.258112668991089, + "logps/chosen": -211.1614227294922, + "logps/rejected": -247.2993927001953, + "loss": 0.691, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.008593087084591389, + "rewards/margins": 0.05968620628118515, + "rewards/rejected": -0.06827928870916367, + "step": 11000 + }, + { + "epoch": 0.72, + "eval_logits/chosen": -2.312530279159546, + "eval_logits/rejected": -2.125136613845825, + "eval_logps/chosen": -229.34445190429688, + "eval_logps/rejected": -216.98072814941406, + "eval_loss": 0.689818263053894, + "eval_rewards/accuracies": 0.6395000219345093, + "eval_rewards/chosen": 0.026604950428009033, + "eval_rewards/margins": 0.08029335737228394, + "eval_rewards/rejected": -0.0536884069442749, + "eval_runtime": 715.7804, + "eval_samples_per_second": 2.794, + "eval_steps_per_second": 1.397, + "step": 11000 + }, + { + "epoch": 0.72, + "learning_rate": 1.0994941230135536e-06, + "logits/chosen": -2.2787528038024902, + "logits/rejected": -1.983764410018921, + "logps/chosen": -228.21664428710938, + "logps/rejected": -204.8932342529297, + "loss": 0.6872, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.04633352905511856, + "rewards/margins": 0.12736350297927856, + "rewards/rejected": -0.0810299813747406, + "step": 11010 + }, + { + "epoch": 0.72, + "learning_rate": 1.094767942681804e-06, + "logits/chosen": -2.557792901992798, + "logits/rejected": -2.1338038444519043, + "logps/chosen": -249.3724365234375, + "logps/rejected": -218.78439331054688, + "loss": 0.6896, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.015034383162856102, + "rewards/margins": 0.0861252024769783, + "rewards/rejected": -0.10115957260131836, + "step": 11020 + }, + { + "epoch": 0.72, + "learning_rate": 1.0900490927288248e-06, + "logits/chosen": -2.0898823738098145, + "logits/rejected": -2.044926166534424, + "logps/chosen": -258.9222106933594, + "logps/rejected": -215.9126739501953, + "loss": 0.6904, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.015460757538676262, + "rewards/margins": 0.06466784328222275, + "rewards/rejected": -0.049207091331481934, + "step": 11030 + }, + { + "epoch": 0.72, + "learning_rate": 1.0853375977704511e-06, + "logits/chosen": -2.3183531761169434, + "logits/rejected": -2.1303043365478516, + "logps/chosen": -237.7437744140625, + "logps/rejected": -181.30435180664062, + "loss": 0.6904, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.026807749643921852, + "rewards/margins": 0.07571287453174591, + "rewards/rejected": -0.04890512302517891, + "step": 11040 + }, + { + "epoch": 0.72, + "learning_rate": 1.0806334823841466e-06, + "logits/chosen": -2.1648404598236084, + "logits/rejected": -2.268681764602661, + "logps/chosen": -239.4142608642578, + "logps/rejected": -274.93853759765625, + "loss": 0.6903, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.002016544807702303, + "rewards/margins": 0.061211831867694855, + "rewards/rejected": -0.05919528007507324, + "step": 11050 + }, + { + "epoch": 0.72, + "learning_rate": 1.0759367711088825e-06, + "logits/chosen": -2.175215244293213, + "logits/rejected": -2.3136703968048096, + "logps/chosen": -180.73880004882812, + "logps/rejected": -219.13919067382812, + "loss": 0.6919, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.030937161296606064, + "rewards/margins": 0.049232009798288345, + "rewards/rejected": -0.018294844776391983, + "step": 11060 + }, + { + "epoch": 0.72, + "learning_rate": 1.0712474884450056e-06, + "logits/chosen": -2.2868709564208984, + "logits/rejected": -2.0856966972351074, + "logps/chosen": -200.55062866210938, + "logps/rejected": -180.42941284179688, + "loss": 0.6902, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.02282613143324852, + "rewards/margins": 0.08665598928928375, + "rewards/rejected": -0.06382984668016434, + "step": 11070 + }, + { + "epoch": 0.72, + "learning_rate": 1.066565658854112e-06, + "logits/chosen": -2.2322375774383545, + "logits/rejected": -2.195129871368408, + "logps/chosen": -127.13566589355469, + "logps/rejected": -132.34140014648438, + "loss": 0.6895, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.006962375249713659, + "rewards/margins": 0.08438241481781006, + "rewards/rejected": -0.07742004096508026, + "step": 11080 + }, + { + "epoch": 0.73, + "learning_rate": 1.0618913067589165e-06, + "logits/chosen": -2.3321659564971924, + "logits/rejected": -2.115520715713501, + "logps/chosen": -213.48275756835938, + "logps/rejected": -184.6875457763672, + "loss": 0.6886, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0233194287866354, + "rewards/margins": 0.08664701133966446, + "rewards/rejected": -0.06332757323980331, + "step": 11090 + }, + { + "epoch": 0.73, + "learning_rate": 1.0572244565431313e-06, + "logits/chosen": -2.1922779083251953, + "logits/rejected": -2.075507402420044, + "logps/chosen": -143.26010131835938, + "logps/rejected": -156.20030212402344, + "loss": 0.6903, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.02156703732907772, + "rewards/margins": 0.07779018580913544, + "rewards/rejected": -0.05622314661741257, + "step": 11100 + }, + { + "epoch": 0.73, + "eval_logits/chosen": -2.313178300857544, + "eval_logits/rejected": -2.1258115768432617, + "eval_logps/chosen": -228.88185119628906, + "eval_logps/rejected": -216.5213623046875, + "eval_loss": 0.6898159384727478, + "eval_rewards/accuracies": 0.6359999775886536, + "eval_rewards/chosen": 0.031231021508574486, + "eval_rewards/margins": 0.08032544702291489, + "eval_rewards/rejected": -0.04909442365169525, + "eval_runtime": 712.9033, + "eval_samples_per_second": 2.805, + "eval_steps_per_second": 1.403, + "step": 11100 + }, + { + "epoch": 0.73, + "learning_rate": 1.0525651325513317e-06, + "logits/chosen": -2.316305637359619, + "logits/rejected": -2.294666290283203, + "logps/chosen": -327.4156188964844, + "logps/rejected": -310.8594970703125, + "loss": 0.6905, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.03854013979434967, + "rewards/margins": 0.04599260538816452, + "rewards/rejected": -0.007452460937201977, + "step": 11110 + }, + { + "epoch": 0.73, + "learning_rate": 1.0479133590888351e-06, + "logits/chosen": -2.352234363555908, + "logits/rejected": -2.059476613998413, + "logps/chosen": -252.9855194091797, + "logps/rejected": -235.13720703125, + "loss": 0.6886, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.024694135412573814, + "rewards/margins": 0.0912385955452919, + "rewards/rejected": -0.06654445827007294, + "step": 11120 + }, + { + "epoch": 0.73, + "learning_rate": 1.0432691604215695e-06, + "logits/chosen": -2.231292963027954, + "logits/rejected": -2.14487624168396, + "logps/chosen": -231.04995727539062, + "logps/rejected": -204.88375854492188, + "loss": 0.6913, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.0399702712893486, + "rewards/margins": 0.055469810962677, + "rewards/rejected": -0.015499535016715527, + "step": 11130 + }, + { + "epoch": 0.73, + "learning_rate": 1.0386325607759515e-06, + "logits/chosen": -2.2429559230804443, + "logits/rejected": -2.180182456970215, + "logps/chosen": -187.20619201660156, + "logps/rejected": -175.9566650390625, + "loss": 0.6892, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.041839271783828735, + "rewards/margins": 0.08820907026529312, + "rewards/rejected": -0.04636979475617409, + "step": 11140 + }, + { + "epoch": 0.73, + "learning_rate": 1.0340035843387544e-06, + "logits/chosen": -2.34846830368042, + "logits/rejected": -2.021655797958374, + "logps/chosen": -177.23959350585938, + "logps/rejected": -156.76651000976562, + "loss": 0.6902, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.024959605187177658, + "rewards/margins": 0.07109765708446503, + "rewards/rejected": -0.04613804817199707, + "step": 11150 + }, + { + "epoch": 0.73, + "learning_rate": 1.0293822552569887e-06, + "logits/chosen": -2.459784746170044, + "logits/rejected": -2.162053108215332, + "logps/chosen": -248.28750610351562, + "logps/rejected": -202.97152709960938, + "loss": 0.6893, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.035963498055934906, + "rewards/margins": 0.10514490306377411, + "rewards/rejected": -0.0691814124584198, + "step": 11160 + }, + { + "epoch": 0.73, + "learning_rate": 1.0247685976377688e-06, + "logits/chosen": -2.2233853340148926, + "logits/rejected": -2.050699234008789, + "logps/chosen": -179.34494018554688, + "logps/rejected": -148.3097686767578, + "loss": 0.6905, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.023400804027915, + "rewards/margins": 0.07796212285757065, + "rewards/rejected": -0.0545613169670105, + "step": 11170 + }, + { + "epoch": 0.73, + "learning_rate": 1.0201626355481939e-06, + "logits/chosen": -2.4143099784851074, + "logits/rejected": -2.1229655742645264, + "logps/chosen": -214.44876098632812, + "logps/rejected": -169.19232177734375, + "loss": 0.6882, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.01671757362782955, + "rewards/margins": 0.0830346867442131, + "rewards/rejected": -0.0663171112537384, + "step": 11180 + }, + { + "epoch": 0.73, + "learning_rate": 1.0155643930152192e-06, + "logits/chosen": -2.4184117317199707, + "logits/rejected": -2.33054780960083, + "logps/chosen": -270.9144592285156, + "logps/rejected": -222.2581787109375, + "loss": 0.6906, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.016928743571043015, + "rewards/margins": 0.059895895421504974, + "rewards/rejected": -0.04296715185046196, + "step": 11190 + }, + { + "epoch": 0.73, + "learning_rate": 1.0109738940255286e-06, + "logits/chosen": -2.244631290435791, + "logits/rejected": -1.9772279262542725, + "logps/chosen": -214.3131561279297, + "logps/rejected": -183.52496337890625, + "loss": 0.6918, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.02669384703040123, + "rewards/margins": 0.06493046879768372, + "rewards/rejected": -0.038236625492572784, + "step": 11200 + }, + { + "epoch": 0.73, + "eval_logits/chosen": -2.3133840560913086, + "eval_logits/rejected": -2.126025438308716, + "eval_logps/chosen": -228.95094299316406, + "eval_logps/rejected": -216.6020965576172, + "eval_loss": 0.6898157000541687, + "eval_rewards/accuracies": 0.637499988079071, + "eval_rewards/chosen": 0.03054005466401577, + "eval_rewards/margins": 0.08044183254241943, + "eval_rewards/rejected": -0.04990177974104881, + "eval_runtime": 716.7898, + "eval_samples_per_second": 2.79, + "eval_steps_per_second": 1.395, + "step": 11200 + }, + { + "epoch": 0.73, + "learning_rate": 1.0063911625254155e-06, + "logits/chosen": -2.3134210109710693, + "logits/rejected": -2.174760341644287, + "logps/chosen": -229.67391967773438, + "logps/rejected": -238.20559692382812, + "loss": 0.6898, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.06248721480369568, + "rewards/margins": 0.0790223479270935, + "rewards/rejected": -0.016535133123397827, + "step": 11210 + }, + { + "epoch": 0.73, + "learning_rate": 1.0018162224206502e-06, + "logits/chosen": -2.220797061920166, + "logits/rejected": -2.130765438079834, + "logps/chosen": -172.0438232421875, + "logps/rejected": -187.99996948242188, + "loss": 0.6883, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.024309862405061722, + "rewards/margins": 0.1224609836935997, + "rewards/rejected": -0.09815112501382828, + "step": 11220 + }, + { + "epoch": 0.73, + "learning_rate": 9.97249097576363e-07, + "logits/chosen": -2.467745304107666, + "logits/rejected": -2.184107780456543, + "logps/chosen": -230.2428741455078, + "logps/rejected": -197.06825256347656, + "loss": 0.6876, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.028137261047959328, + "rewards/margins": 0.10317236185073853, + "rewards/rejected": -0.07503510266542435, + "step": 11230 + }, + { + "epoch": 0.74, + "learning_rate": 9.92689811816913e-07, + "logits/chosen": -2.336843967437744, + "logits/rejected": -2.1020307540893555, + "logps/chosen": -213.20925903320312, + "logps/rejected": -173.838134765625, + "loss": 0.6902, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0068805525079369545, + "rewards/margins": 0.08031884580850601, + "rewards/rejected": -0.08719939738512039, + "step": 11240 + }, + { + "epoch": 0.74, + "learning_rate": 9.881383889257691e-07, + "logits/chosen": -2.2797598838806152, + "logits/rejected": -2.319304943084717, + "logps/chosen": -165.97084045410156, + "logps/rejected": -236.264404296875, + "loss": 0.6898, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.03326042369008064, + "rewards/margins": 0.06215088814496994, + "rewards/rejected": -0.028890466317534447, + "step": 11250 + }, + { + "epoch": 0.74, + "learning_rate": 9.835948526453817e-07, + "logits/chosen": -2.1445116996765137, + "logits/rejected": -2.3151485919952393, + "logps/chosen": -179.73304748535156, + "logps/rejected": -228.52316284179688, + "loss": 0.6916, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.0069632395170629025, + "rewards/margins": 0.05118337273597717, + "rewards/rejected": -0.044220130890607834, + "step": 11260 + }, + { + "epoch": 0.74, + "learning_rate": 9.790592266770633e-07, + "logits/chosen": -2.5001702308654785, + "logits/rejected": -2.2158002853393555, + "logps/chosen": -266.3745422363281, + "logps/rejected": -245.5768280029297, + "loss": 0.6901, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.02970188483595848, + "rewards/margins": 0.07441006600856781, + "rewards/rejected": -0.044708192348480225, + "step": 11270 + }, + { + "epoch": 0.74, + "learning_rate": 9.745315346808584e-07, + "logits/chosen": -2.18174409866333, + "logits/rejected": -2.029189348220825, + "logps/chosen": -215.11642456054688, + "logps/rejected": -199.79173278808594, + "loss": 0.6894, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.039590004831552505, + "rewards/margins": 0.07058382779359818, + "rewards/rejected": -0.03099382482469082, + "step": 11280 + }, + { + "epoch": 0.74, + "learning_rate": 9.70011800275428e-07, + "logits/chosen": -2.236311674118042, + "logits/rejected": -2.1652560234069824, + "logps/chosen": -235.71142578125, + "logps/rejected": -255.64370727539062, + "loss": 0.6884, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.018134601414203644, + "rewards/margins": 0.08486216515302658, + "rewards/rejected": -0.06672756373882294, + "step": 11290 + }, + { + "epoch": 0.74, + "learning_rate": 9.655000470379206e-07, + "logits/chosen": -2.1597790718078613, + "logits/rejected": -2.0728516578674316, + "logps/chosen": -209.0076446533203, + "logps/rejected": -208.43862915039062, + "loss": 0.6879, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.013398419134318829, + "rewards/margins": 0.10113543272018433, + "rewards/rejected": -0.08773700892925262, + "step": 11300 + }, + { + "epoch": 0.74, + "eval_logits/chosen": -2.315458297729492, + "eval_logits/rejected": -2.127800226211548, + "eval_logps/chosen": -229.95440673828125, + "eval_logps/rejected": -217.7364959716797, + "eval_loss": 0.6897902488708496, + "eval_rewards/accuracies": 0.6380000114440918, + "eval_rewards/chosen": 0.020505422726273537, + "eval_rewards/margins": 0.08175148069858551, + "eval_rewards/rejected": -0.061246056109666824, + "eval_runtime": 712.1423, + "eval_samples_per_second": 2.808, + "eval_steps_per_second": 1.404, + "step": 11300 + }, + { + "epoch": 0.74, + "learning_rate": 9.609962985038517e-07, + "logits/chosen": -2.427191734313965, + "logits/rejected": -2.028752088546753, + "logps/chosen": -213.2774658203125, + "logps/rejected": -200.72789001464844, + "loss": 0.6886, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.020835842937231064, + "rewards/margins": 0.12575358152389526, + "rewards/rejected": -0.1049177423119545, + "step": 11310 + }, + { + "epoch": 0.74, + "learning_rate": 9.565005781669786e-07, + "logits/chosen": -2.4781110286712646, + "logits/rejected": -2.1154332160949707, + "logps/chosen": -257.0857238769531, + "logps/rejected": -213.7594451904297, + "loss": 0.6879, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.0350492037832737, + "rewards/margins": 0.09986601769924164, + "rewards/rejected": -0.06481683254241943, + "step": 11320 + }, + { + "epoch": 0.74, + "learning_rate": 9.520129094791822e-07, + "logits/chosen": -2.2771661281585693, + "logits/rejected": -2.1108384132385254, + "logps/chosen": -179.57064819335938, + "logps/rejected": -179.90342712402344, + "loss": 0.6887, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.0007491965079680085, + "rewards/margins": 0.1053602546453476, + "rewards/rejected": -0.10610946267843246, + "step": 11330 + }, + { + "epoch": 0.74, + "learning_rate": 9.475333158503389e-07, + "logits/chosen": -2.2677841186523438, + "logits/rejected": -1.972190499305725, + "logps/chosen": -215.33609008789062, + "logps/rejected": -175.67623901367188, + "loss": 0.6908, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.024861928075551987, + "rewards/margins": 0.0549871027469635, + "rewards/rejected": -0.030125176534056664, + "step": 11340 + }, + { + "epoch": 0.74, + "learning_rate": 9.430618206482053e-07, + "logits/chosen": -2.2450308799743652, + "logits/rejected": -2.185279130935669, + "logps/chosen": -135.9246063232422, + "logps/rejected": -141.6876220703125, + "loss": 0.6916, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.044983211904764175, + "rewards/margins": 0.05167509242892265, + "rewards/rejected": -0.006691886577755213, + "step": 11350 + }, + { + "epoch": 0.74, + "learning_rate": 9.385984471982892e-07, + "logits/chosen": -2.222777843475342, + "logits/rejected": -1.859256386756897, + "logps/chosen": -213.132080078125, + "logps/rejected": -176.3279571533203, + "loss": 0.6865, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.016937825828790665, + "rewards/margins": 0.13930463790893555, + "rewards/rejected": -0.12236680835485458, + "step": 11360 + }, + { + "epoch": 0.74, + "learning_rate": 9.341432187837343e-07, + "logits/chosen": -2.2822318077087402, + "logits/rejected": -2.191162586212158, + "logps/chosen": -187.1429443359375, + "logps/rejected": -212.93783569335938, + "loss": 0.686, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.03160820156335831, + "rewards/margins": 0.10926495492458344, + "rewards/rejected": -0.07765677571296692, + "step": 11370 + }, + { + "epoch": 0.74, + "learning_rate": 9.29696158645193e-07, + "logits/chosen": -2.222790241241455, + "logits/rejected": -2.32561993598938, + "logps/chosen": -218.55014038085938, + "logps/rejected": -259.57879638671875, + "loss": 0.6882, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.02552112378180027, + "rewards/margins": 0.12038824707269669, + "rewards/rejected": -0.09486713260412216, + "step": 11380 + }, + { + "epoch": 0.75, + "learning_rate": 9.252572899807111e-07, + "logits/chosen": -2.2863879203796387, + "logits/rejected": -2.305600166320801, + "logps/chosen": -268.54376220703125, + "logps/rejected": -253.93896484375, + "loss": 0.6876, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.020558223128318787, + "rewards/margins": 0.11582138389348984, + "rewards/rejected": -0.09526316076517105, + "step": 11390 + }, + { + "epoch": 0.75, + "learning_rate": 9.208266359456003e-07, + "logits/chosen": -2.3670859336853027, + "logits/rejected": -2.106987237930298, + "logps/chosen": -202.4696807861328, + "logps/rejected": -219.4122772216797, + "loss": 0.6896, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.03336096554994583, + "rewards/margins": 0.07371880859136581, + "rewards/rejected": -0.04035785049200058, + "step": 11400 + }, + { + "epoch": 0.75, + "eval_logits/chosen": -2.31719708442688, + "eval_logits/rejected": -2.1292054653167725, + "eval_logps/chosen": -230.30577087402344, + "eval_logps/rejected": -218.55361938476562, + "eval_loss": 0.6897911429405212, + "eval_rewards/accuracies": 0.6355000138282776, + "eval_rewards/chosen": 0.01699184998869896, + "eval_rewards/margins": 0.08640897274017334, + "eval_rewards/rejected": -0.06941711902618408, + "eval_runtime": 713.2491, + "eval_samples_per_second": 2.804, + "eval_steps_per_second": 1.402, + "step": 11400 + }, + { + "epoch": 0.75, + "learning_rate": 9.164042196523229e-07, + "logits/chosen": -2.4905319213867188, + "logits/rejected": -2.1854054927825928, + "logps/chosen": -187.57745361328125, + "logps/rejected": -185.0448455810547, + "loss": 0.69, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.018236028030514717, + "rewards/margins": 0.1098506823182106, + "rewards/rejected": -0.09161464869976044, + "step": 11410 + }, + { + "epoch": 0.75, + "learning_rate": 9.119900641703696e-07, + "logits/chosen": -2.4532532691955566, + "logits/rejected": -2.1825637817382812, + "logps/chosen": -211.77310180664062, + "logps/rejected": -174.50393676757812, + "loss": 0.6908, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.016732508316636086, + "rewards/margins": 0.09065760672092438, + "rewards/rejected": -0.07392510026693344, + "step": 11420 + }, + { + "epoch": 0.75, + "learning_rate": 9.075841925261364e-07, + "logits/chosen": -2.5480093955993652, + "logits/rejected": -2.3063910007476807, + "logps/chosen": -235.31533813476562, + "logps/rejected": -228.7047882080078, + "loss": 0.6916, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.026038330048322678, + "rewards/margins": 0.0781828910112381, + "rewards/rejected": -0.05214455723762512, + "step": 11430 + }, + { + "epoch": 0.75, + "learning_rate": 9.031866277028093e-07, + "logits/chosen": -2.2410953044891357, + "logits/rejected": -2.2712674140930176, + "logps/chosen": -190.3566436767578, + "logps/rejected": -225.5463104248047, + "loss": 0.6889, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.007181202061474323, + "rewards/margins": 0.08380020409822464, + "rewards/rejected": -0.07661899924278259, + "step": 11440 + }, + { + "epoch": 0.75, + "learning_rate": 8.987973926402391e-07, + "logits/chosen": -2.2060625553131104, + "logits/rejected": -2.242389440536499, + "logps/chosen": -209.5387420654297, + "logps/rejected": -218.03573608398438, + "loss": 0.6894, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.028008287772536278, + "rewards/margins": 0.09084399789571762, + "rewards/rejected": -0.0628357082605362, + "step": 11450 + }, + { + "epoch": 0.75, + "learning_rate": 8.944165102348273e-07, + "logits/chosen": -2.419955015182495, + "logits/rejected": -2.27038311958313, + "logps/chosen": -153.3720703125, + "logps/rejected": -171.96742248535156, + "loss": 0.6879, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.0357503779232502, + "rewards/margins": 0.11337963491678238, + "rewards/rejected": -0.07762926071882248, + "step": 11460 + }, + { + "epoch": 0.75, + "learning_rate": 8.900440033394018e-07, + "logits/chosen": -2.2393195629119873, + "logits/rejected": -2.2905590534210205, + "logps/chosen": -179.89437866210938, + "logps/rejected": -177.8740692138672, + "loss": 0.6904, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.02621867135167122, + "rewards/margins": 0.07265409827232361, + "rewards/rejected": -0.04643542319536209, + "step": 11470 + }, + { + "epoch": 0.75, + "learning_rate": 8.856798947631009e-07, + "logits/chosen": -2.263493537902832, + "logits/rejected": -2.2867515087127686, + "logps/chosen": -187.29135131835938, + "logps/rejected": -215.5435791015625, + "loss": 0.6878, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.03584372624754906, + "rewards/margins": 0.11300251632928848, + "rewards/rejected": -0.07715878635644913, + "step": 11480 + }, + { + "epoch": 0.75, + "learning_rate": 8.813242072712519e-07, + "logits/chosen": -2.0134589672088623, + "logits/rejected": -1.897157073020935, + "logps/chosen": -162.03453063964844, + "logps/rejected": -171.01966857910156, + "loss": 0.6905, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.001722379820421338, + "rewards/margins": 0.0784909725189209, + "rewards/rejected": -0.08021334558725357, + "step": 11490 + }, + { + "epoch": 0.75, + "learning_rate": 8.769769635852557e-07, + "logits/chosen": -2.2438220977783203, + "logits/rejected": -2.300549268722534, + "logps/chosen": -213.259521484375, + "logps/rejected": -182.95872497558594, + "loss": 0.6904, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.020482342690229416, + "rewards/margins": 0.06456250697374344, + "rewards/rejected": -0.04408016428351402, + "step": 11500 + }, + { + "epoch": 0.75, + "eval_logits/chosen": -2.3182504177093506, + "eval_logits/rejected": -2.1302804946899414, + "eval_logps/chosen": -230.00030517578125, + "eval_logps/rejected": -217.7165069580078, + "eval_loss": 0.6898082494735718, + "eval_rewards/accuracies": 0.6294999718666077, + "eval_rewards/chosen": 0.02004634030163288, + "eval_rewards/margins": 0.08109237998723984, + "eval_rewards/rejected": -0.06104603409767151, + "eval_runtime": 713.4185, + "eval_samples_per_second": 2.803, + "eval_steps_per_second": 1.402, + "step": 11500 + }, + { + "epoch": 0.75, + "learning_rate": 8.726381863824635e-07, + "logits/chosen": -2.4795243740081787, + "logits/rejected": -2.1508588790893555, + "logps/chosen": -280.99859619140625, + "logps/rejected": -221.33462524414062, + "loss": 0.6901, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.04078471660614014, + "rewards/margins": 0.0774230808019638, + "rewards/rejected": -0.036638353019952774, + "step": 11510 + }, + { + "epoch": 0.75, + "learning_rate": 8.683078982960638e-07, + "logits/chosen": -2.23110294342041, + "logits/rejected": -1.9223436117172241, + "logps/chosen": -224.91244506835938, + "logps/rejected": -185.7821044921875, + "loss": 0.6873, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0022748790215700865, + "rewards/margins": 0.1096058264374733, + "rewards/rejected": -0.11188069730997086, + "step": 11520 + }, + { + "epoch": 0.75, + "learning_rate": 8.639861219149584e-07, + "logits/chosen": -2.0781750679016113, + "logits/rejected": -2.1381192207336426, + "logps/chosen": -263.2950134277344, + "logps/rejected": -229.32223510742188, + "loss": 0.6875, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.006869921926409006, + "rewards/margins": 0.08951739221811295, + "rewards/rejected": -0.08264746516942978, + "step": 11530 + }, + { + "epoch": 0.76, + "learning_rate": 8.596728797836532e-07, + "logits/chosen": -2.211719274520874, + "logits/rejected": -2.0574100017547607, + "logps/chosen": -212.7632293701172, + "logps/rejected": -255.2969512939453, + "loss": 0.6865, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.025454815477132797, + "rewards/margins": 0.12940728664398193, + "rewards/rejected": -0.10395244508981705, + "step": 11540 + }, + { + "epoch": 0.76, + "learning_rate": 8.553681944021294e-07, + "logits/chosen": -2.264085054397583, + "logits/rejected": -2.2858448028564453, + "logps/chosen": -237.9873809814453, + "logps/rejected": -227.55477905273438, + "loss": 0.6886, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.010966275818645954, + "rewards/margins": 0.08914720267057419, + "rewards/rejected": -0.07818093150854111, + "step": 11550 + }, + { + "epoch": 0.76, + "learning_rate": 8.510720882257365e-07, + "logits/chosen": -2.041898727416992, + "logits/rejected": -2.1856493949890137, + "logps/chosen": -158.22018432617188, + "logps/rejected": -210.814208984375, + "loss": 0.6862, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.030803903937339783, + "rewards/margins": 0.1136208325624466, + "rewards/rejected": -0.08281692862510681, + "step": 11560 + }, + { + "epoch": 0.76, + "learning_rate": 8.467845836650667e-07, + "logits/chosen": -1.929513692855835, + "logits/rejected": -1.977299451828003, + "logps/chosen": -207.1142578125, + "logps/rejected": -222.85317993164062, + "loss": 0.6872, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.011523036286234856, + "rewards/margins": 0.09183812886476517, + "rewards/rejected": -0.08031509816646576, + "step": 11570 + }, + { + "epoch": 0.76, + "learning_rate": 8.425057030858461e-07, + "logits/chosen": -2.1237521171569824, + "logits/rejected": -1.9577020406723022, + "logps/chosen": -155.14407348632812, + "logps/rejected": -192.67381286621094, + "loss": 0.6894, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.009118120186030865, + "rewards/margins": 0.08506849408149719, + "rewards/rejected": -0.07595036923885345, + "step": 11580 + }, + { + "epoch": 0.76, + "learning_rate": 8.382354688088098e-07, + "logits/chosen": -2.2753641605377197, + "logits/rejected": -2.135500192642212, + "logps/chosen": -157.50479125976562, + "logps/rejected": -181.17617797851562, + "loss": 0.69, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.006505739875137806, + "rewards/margins": 0.08248453587293625, + "rewards/rejected": -0.07597880065441132, + "step": 11590 + }, + { + "epoch": 0.76, + "learning_rate": 8.33973903109594e-07, + "logits/chosen": -2.396981954574585, + "logits/rejected": -2.1408324241638184, + "logps/chosen": -216.294677734375, + "logps/rejected": -194.80892944335938, + "loss": 0.6891, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.0030242991633713245, + "rewards/margins": 0.10075845569372177, + "rewards/rejected": -0.09773416072130203, + "step": 11600 + }, + { + "epoch": 0.76, + "eval_logits/chosen": -2.314687967300415, + "eval_logits/rejected": -2.126920223236084, + "eval_logps/chosen": -231.07017517089844, + "eval_logps/rejected": -219.44680786132812, + "eval_loss": 0.6898018717765808, + "eval_rewards/accuracies": 0.6370000243186951, + "eval_rewards/chosen": 0.009347718209028244, + "eval_rewards/margins": 0.08769676089286804, + "eval_rewards/rejected": -0.0783490464091301, + "eval_runtime": 711.5416, + "eval_samples_per_second": 2.811, + "eval_steps_per_second": 1.405, + "step": 11600 + }, + { + "epoch": 0.76, + "learning_rate": 8.297210282186102e-07, + "logits/chosen": -2.1594557762145996, + "logits/rejected": -2.1344146728515625, + "logps/chosen": -232.7439727783203, + "logps/rejected": -261.56768798828125, + "loss": 0.6891, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.027264848351478577, + "rewards/margins": 0.07457095384597778, + "rewards/rejected": -0.10183580219745636, + "step": 11610 + }, + { + "epoch": 0.76, + "learning_rate": 8.254768663209397e-07, + "logits/chosen": -2.260918378829956, + "logits/rejected": -2.054774522781372, + "logps/chosen": -277.40802001953125, + "logps/rejected": -221.30020141601562, + "loss": 0.6911, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.01171756163239479, + "rewards/margins": 0.0636371374130249, + "rewards/rejected": -0.05191957205533981, + "step": 11620 + }, + { + "epoch": 0.76, + "learning_rate": 8.212414395562079e-07, + "logits/chosen": -2.1018333435058594, + "logits/rejected": -2.234898090362549, + "logps/chosen": -231.6614990234375, + "logps/rejected": -261.87396240234375, + "loss": 0.6905, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.014473098330199718, + "rewards/margins": 0.06291624903678894, + "rewards/rejected": -0.07738934457302094, + "step": 11630 + }, + { + "epoch": 0.76, + "learning_rate": 8.170147700184775e-07, + "logits/chosen": -2.3055419921875, + "logits/rejected": -2.1873691082000732, + "logps/chosen": -254.23110961914062, + "logps/rejected": -254.5798797607422, + "loss": 0.6906, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.022202350199222565, + "rewards/margins": 0.09281570464372635, + "rewards/rejected": -0.07061335444450378, + "step": 11640 + }, + { + "epoch": 0.76, + "learning_rate": 8.127968797561242e-07, + "logits/chosen": -2.353506565093994, + "logits/rejected": -2.0914809703826904, + "logps/chosen": -223.8273162841797, + "logps/rejected": -212.8360595703125, + "loss": 0.6888, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.0014341063797473907, + "rewards/margins": 0.11431723833084106, + "rewards/rejected": -0.11288313567638397, + "step": 11650 + }, + { + "epoch": 0.76, + "learning_rate": 8.085877907717338e-07, + "logits/chosen": -2.246596574783325, + "logits/rejected": -2.2063441276550293, + "logps/chosen": -219.5164794921875, + "logps/rejected": -218.2258758544922, + "loss": 0.6897, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.013950645923614502, + "rewards/margins": 0.10388661921024323, + "rewards/rejected": -0.08993595838546753, + "step": 11660 + }, + { + "epoch": 0.76, + "learning_rate": 8.043875250219732e-07, + "logits/chosen": -2.2046780586242676, + "logits/rejected": -2.141763687133789, + "logps/chosen": -231.0264434814453, + "logps/rejected": -213.81973266601562, + "loss": 0.6909, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.016217241063714027, + "rewards/margins": 0.03862147778272629, + "rewards/rejected": -0.054838716983795166, + "step": 11670 + }, + { + "epoch": 0.76, + "learning_rate": 8.001961044174881e-07, + "logits/chosen": -2.3708043098449707, + "logits/rejected": -2.1657023429870605, + "logps/chosen": -230.3643341064453, + "logps/rejected": -176.6756591796875, + "loss": 0.6937, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.01899898424744606, + "rewards/margins": 0.03499449044466019, + "rewards/rejected": -0.053993482142686844, + "step": 11680 + }, + { + "epoch": 0.76, + "learning_rate": 7.960135508227795e-07, + "logits/chosen": -2.3832452297210693, + "logits/rejected": -2.029789686203003, + "logps/chosen": -294.03143310546875, + "logps/rejected": -236.58212280273438, + "loss": 0.691, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.001624174416065216, + "rewards/margins": 0.06866665184497833, + "rewards/rejected": -0.07029082626104355, + "step": 11690 + }, + { + "epoch": 0.77, + "learning_rate": 7.91839886056098e-07, + "logits/chosen": -2.3958792686462402, + "logits/rejected": -2.139774799346924, + "logps/chosen": -285.03851318359375, + "logps/rejected": -261.7726135253906, + "loss": 0.6883, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.012762362137436867, + "rewards/margins": 0.07304862141609192, + "rewards/rejected": -0.08581098169088364, + "step": 11700 + }, + { + "epoch": 0.77, + "eval_logits/chosen": -2.31754469871521, + "eval_logits/rejected": -2.1296095848083496, + "eval_logps/chosen": -231.76707458496094, + "eval_logps/rejected": -219.65855407714844, + "eval_loss": 0.6898021697998047, + "eval_rewards/accuracies": 0.6355000138282776, + "eval_rewards/chosen": 0.002378788311034441, + "eval_rewards/margins": 0.08284525573253632, + "eval_rewards/rejected": -0.08046647161245346, + "eval_runtime": 714.8864, + "eval_samples_per_second": 2.798, + "eval_steps_per_second": 1.399, + "step": 11700 + }, + { + "epoch": 0.77, + "learning_rate": 7.876751318893217e-07, + "logits/chosen": -2.2193684577941895, + "logits/rejected": -1.9712486267089844, + "logps/chosen": -236.8195343017578, + "logps/rejected": -224.7071533203125, + "loss": 0.6904, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.010461434721946716, + "rewards/margins": 0.09475782513618469, + "rewards/rejected": -0.08429639786481857, + "step": 11710 + }, + { + "epoch": 0.77, + "learning_rate": 7.8351931004785e-07, + "logits/chosen": -2.2037353515625, + "logits/rejected": -1.8471559286117554, + "logps/chosen": -207.496337890625, + "logps/rejected": -191.63697814941406, + "loss": 0.6883, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.011319467797875404, + "rewards/margins": 0.09023983776569366, + "rewards/rejected": -0.07892037183046341, + "step": 11720 + }, + { + "epoch": 0.77, + "learning_rate": 7.793724422104834e-07, + "logits/chosen": -2.0594654083251953, + "logits/rejected": -2.2132391929626465, + "logps/chosen": -208.775146484375, + "logps/rejected": -288.7984924316406, + "loss": 0.6889, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.0004916332545690238, + "rewards/margins": 0.10546108335256577, + "rewards/rejected": -0.10496945679187775, + "step": 11730 + }, + { + "epoch": 0.77, + "learning_rate": 7.752345500093184e-07, + "logits/chosen": -2.360964059829712, + "logits/rejected": -2.3283915519714355, + "logps/chosen": -213.69003295898438, + "logps/rejected": -188.79702758789062, + "loss": 0.6907, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.03534995764493942, + "rewards/margins": 0.048714593052864075, + "rewards/rejected": -0.0840645581483841, + "step": 11740 + }, + { + "epoch": 0.77, + "learning_rate": 7.711056550296253e-07, + "logits/chosen": -2.4189441204071045, + "logits/rejected": -2.2487740516662598, + "logps/chosen": -245.80990600585938, + "logps/rejected": -219.0479736328125, + "loss": 0.6919, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.009839094243943691, + "rewards/margins": 0.10263122618198395, + "rewards/rejected": -0.09279213845729828, + "step": 11750 + }, + { + "epoch": 0.77, + "learning_rate": 7.669857788097445e-07, + "logits/chosen": -2.1502368450164795, + "logits/rejected": -1.9063570499420166, + "logps/chosen": -164.16265869140625, + "logps/rejected": -200.6251678466797, + "loss": 0.6881, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.025966918095946312, + "rewards/margins": 0.09837634116411209, + "rewards/rejected": -0.12434325367212296, + "step": 11760 + }, + { + "epoch": 0.77, + "learning_rate": 7.628749428409676e-07, + "logits/chosen": -2.410966157913208, + "logits/rejected": -1.9934518337249756, + "logps/chosen": -239.00985717773438, + "logps/rejected": -188.3817138671875, + "loss": 0.6898, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.02122049406170845, + "rewards/margins": 0.08123396337032318, + "rewards/rejected": -0.10245446860790253, + "step": 11770 + }, + { + "epoch": 0.77, + "learning_rate": 7.587731685674288e-07, + "logits/chosen": -2.293890953063965, + "logits/rejected": -2.332033157348633, + "logps/chosen": -272.982666015625, + "logps/rejected": -288.7724304199219, + "loss": 0.6904, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.006056091282516718, + "rewards/margins": 0.07952813804149628, + "rewards/rejected": -0.0734720379114151, + "step": 11780 + }, + { + "epoch": 0.77, + "learning_rate": 7.546804773859931e-07, + "logits/chosen": -2.392029047012329, + "logits/rejected": -2.1629507541656494, + "logps/chosen": -228.03903198242188, + "logps/rejected": -219.1748809814453, + "loss": 0.6891, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.009989907965064049, + "rewards/margins": 0.11071814596652985, + "rewards/rejected": -0.12070806324481964, + "step": 11790 + }, + { + "epoch": 0.77, + "learning_rate": 7.505968906461409e-07, + "logits/chosen": -2.295881509780884, + "logits/rejected": -2.148301362991333, + "logps/chosen": -243.2596893310547, + "logps/rejected": -226.50534057617188, + "loss": 0.69, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.023195995017886162, + "rewards/margins": 0.07391957193613052, + "rewards/rejected": -0.09711556136608124, + "step": 11800 + }, + { + "epoch": 0.77, + "eval_logits/chosen": -2.3191685676574707, + "eval_logits/rejected": -2.1311001777648926, + "eval_logps/chosen": -232.53021240234375, + "eval_logps/rejected": -220.31980895996094, + "eval_loss": 0.6897976398468018, + "eval_rewards/accuracies": 0.640999972820282, + "eval_rewards/chosen": -0.005252572242170572, + "eval_rewards/margins": 0.08182655274868011, + "eval_rewards/rejected": -0.08707911521196365, + "eval_runtime": 712.4035, + "eval_samples_per_second": 2.807, + "eval_steps_per_second": 1.404, + "step": 11800 + }, + { + "epoch": 0.77, + "learning_rate": 7.465224296498627e-07, + "logits/chosen": -2.3691840171813965, + "logits/rejected": -1.9838594198226929, + "logps/chosen": -233.9862823486328, + "logps/rejected": -198.72694396972656, + "loss": 0.6897, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.010271742939949036, + "rewards/margins": 0.058696817606687546, + "rewards/rejected": -0.06896857172250748, + "step": 11810 + }, + { + "epoch": 0.77, + "learning_rate": 7.424571156515412e-07, + "logits/chosen": -2.234841823577881, + "logits/rejected": -2.2243666648864746, + "logps/chosen": -177.17324829101562, + "logps/rejected": -210.3824005126953, + "loss": 0.6912, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.005319344811141491, + "rewards/margins": 0.10532574355602264, + "rewards/rejected": -0.10000641644001007, + "step": 11820 + }, + { + "epoch": 0.77, + "learning_rate": 7.38400969857847e-07, + "logits/chosen": -2.183497905731201, + "logits/rejected": -1.9732284545898438, + "logps/chosen": -189.73611450195312, + "logps/rejected": -206.5034637451172, + "loss": 0.6851, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.06593702733516693, + "rewards/margins": 0.13615167140960693, + "rewards/rejected": -0.20208871364593506, + "step": 11830 + }, + { + "epoch": 0.77, + "learning_rate": 7.343540134276225e-07, + "logits/chosen": -2.3149163722991943, + "logits/rejected": -2.23742938041687, + "logps/chosen": -169.7374267578125, + "logps/rejected": -179.54759216308594, + "loss": 0.6905, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.009741841815412045, + "rewards/margins": 0.0791388601064682, + "rewards/rejected": -0.06939703226089478, + "step": 11840 + }, + { + "epoch": 0.78, + "learning_rate": 7.303162674717762e-07, + "logits/chosen": -2.2864696979522705, + "logits/rejected": -1.8954169750213623, + "logps/chosen": -213.3994140625, + "logps/rejected": -165.81307983398438, + "loss": 0.6889, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.03171641379594803, + "rewards/margins": 0.08249086141586304, + "rewards/rejected": -0.11420726776123047, + "step": 11850 + }, + { + "epoch": 0.78, + "learning_rate": 7.26287753053167e-07, + "logits/chosen": -2.250080108642578, + "logits/rejected": -2.17789888381958, + "logps/chosen": -267.2779846191406, + "logps/rejected": -273.6764831542969, + "loss": 0.6886, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.023018458858132362, + "rewards/margins": 0.0766671746969223, + "rewards/rejected": -0.09968564659357071, + "step": 11860 + }, + { + "epoch": 0.78, + "learning_rate": 7.222684911865013e-07, + "logits/chosen": -2.3542191982269287, + "logits/rejected": -2.364485263824463, + "logps/chosen": -207.5823516845703, + "logps/rejected": -240.3794708251953, + "loss": 0.6875, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.0041365777142345905, + "rewards/margins": 0.11111694574356079, + "rewards/rejected": -0.1152535229921341, + "step": 11870 + }, + { + "epoch": 0.78, + "learning_rate": 7.182585028382166e-07, + "logits/chosen": -2.407355785369873, + "logits/rejected": -2.1081418991088867, + "logps/chosen": -276.42279052734375, + "logps/rejected": -252.2656707763672, + "loss": 0.6893, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0027120530139654875, + "rewards/margins": 0.08292824774980545, + "rewards/rejected": -0.08564029633998871, + "step": 11880 + }, + { + "epoch": 0.78, + "learning_rate": 7.142578089263769e-07, + "logits/chosen": -2.438586473464966, + "logits/rejected": -2.1107470989227295, + "logps/chosen": -319.8658447265625, + "logps/rejected": -268.9220886230469, + "loss": 0.6916, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.014372703619301319, + "rewards/margins": 0.07856186479330063, + "rewards/rejected": -0.09293456375598907, + "step": 11890 + }, + { + "epoch": 0.78, + "learning_rate": 7.102664303205611e-07, + "logits/chosen": -2.3172056674957275, + "logits/rejected": -2.0737314224243164, + "logps/chosen": -221.40579223632812, + "logps/rejected": -211.95523071289062, + "loss": 0.6871, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.008432546630501747, + "rewards/margins": 0.08426036685705185, + "rewards/rejected": -0.09269289672374725, + "step": 11900 + }, + { + "epoch": 0.78, + "eval_logits/chosen": -2.3179848194122314, + "eval_logits/rejected": -2.130025625228882, + "eval_logps/chosen": -232.76321411132812, + "eval_logps/rejected": -220.74920654296875, + "eval_loss": 0.6897937059402466, + "eval_rewards/accuracies": 0.640999972820282, + "eval_rewards/chosen": -0.007582689169794321, + "eval_rewards/margins": 0.08379034698009491, + "eval_rewards/rejected": -0.09137304127216339, + "eval_runtime": 712.893, + "eval_samples_per_second": 2.805, + "eval_steps_per_second": 1.403, + "step": 11900 + }, + { + "epoch": 0.78, + "learning_rate": 7.062843878417566e-07, + "logits/chosen": -2.4349122047424316, + "logits/rejected": -2.2990574836730957, + "logps/chosen": -222.23208618164062, + "logps/rejected": -196.22946166992188, + "loss": 0.6889, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.012005344964563847, + "rewards/margins": 0.07041595876216888, + "rewards/rejected": -0.05841060355305672, + "step": 11910 + }, + { + "epoch": 0.78, + "learning_rate": 7.023117022622458e-07, + "logits/chosen": -2.355516195297241, + "logits/rejected": -2.0010008811950684, + "logps/chosen": -242.761962890625, + "logps/rejected": -227.82077026367188, + "loss": 0.6904, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.04246622696518898, + "rewards/margins": 0.0832718163728714, + "rewards/rejected": -0.12573805451393127, + "step": 11920 + }, + { + "epoch": 0.78, + "learning_rate": 6.983483943055042e-07, + "logits/chosen": -2.251622200012207, + "logits/rejected": -2.0690550804138184, + "logps/chosen": -281.15789794921875, + "logps/rejected": -231.5454864501953, + "loss": 0.6889, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.006913202814757824, + "rewards/margins": 0.07160593569278717, + "rewards/rejected": -0.07851915061473846, + "step": 11930 + }, + { + "epoch": 0.78, + "learning_rate": 6.943944846460859e-07, + "logits/chosen": -2.2984964847564697, + "logits/rejected": -2.2344911098480225, + "logps/chosen": -218.5690460205078, + "logps/rejected": -177.935791015625, + "loss": 0.6924, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.001852800720371306, + "rewards/margins": 0.06108871102333069, + "rewards/rejected": -0.059235911816358566, + "step": 11940 + }, + { + "epoch": 0.78, + "learning_rate": 6.904499939095225e-07, + "logits/chosen": -2.264219045639038, + "logits/rejected": -2.2117581367492676, + "logps/chosen": -222.3902587890625, + "logps/rejected": -217.29684448242188, + "loss": 0.6892, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.00043100136099383235, + "rewards/margins": 0.1023484319448471, + "rewards/rejected": -0.10277943313121796, + "step": 11950 + }, + { + "epoch": 0.78, + "learning_rate": 6.865149426722079e-07, + "logits/chosen": -2.233142852783203, + "logits/rejected": -2.1767783164978027, + "logps/chosen": -274.33160400390625, + "logps/rejected": -252.09359741210938, + "loss": 0.6901, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.03146423026919365, + "rewards/margins": 0.0845610499382019, + "rewards/rejected": -0.11602529138326645, + "step": 11960 + }, + { + "epoch": 0.78, + "learning_rate": 6.825893514612985e-07, + "logits/chosen": -2.071587324142456, + "logits/rejected": -2.1735153198242188, + "logps/chosen": -236.6058807373047, + "logps/rejected": -243.20828247070312, + "loss": 0.6887, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0003904670593328774, + "rewards/margins": 0.08785964548587799, + "rewards/rejected": -0.08825010061264038, + "step": 11970 + }, + { + "epoch": 0.78, + "learning_rate": 6.786732407546001e-07, + "logits/chosen": -2.084094285964966, + "logits/rejected": -2.0345733165740967, + "logps/chosen": -201.45138549804688, + "logps/rejected": -169.8759765625, + "loss": 0.6893, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.005701950751245022, + "rewards/margins": 0.08140133321285248, + "rewards/rejected": -0.08710329234600067, + "step": 11980 + }, + { + "epoch": 0.78, + "learning_rate": 6.747666309804654e-07, + "logits/chosen": -2.4778218269348145, + "logits/rejected": -2.1540145874023438, + "logps/chosen": -287.5825500488281, + "logps/rejected": -215.3527374267578, + "loss": 0.6906, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.0009467907366342843, + "rewards/margins": 0.07305634021759033, + "rewards/rejected": -0.07210955023765564, + "step": 11990 + }, + { + "epoch": 0.79, + "learning_rate": 6.708695425176831e-07, + "logits/chosen": -2.1013684272766113, + "logits/rejected": -2.102804660797119, + "logps/chosen": -170.0596466064453, + "logps/rejected": -200.979248046875, + "loss": 0.6887, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.010208925232291222, + "rewards/margins": 0.1016424298286438, + "rewards/rejected": -0.11185135692358017, + "step": 12000 + }, + { + "epoch": 0.79, + "eval_logits/chosen": -2.3212053775787354, + "eval_logits/rejected": -2.132889986038208, + "eval_logps/chosen": -232.20034790039062, + "eval_logps/rejected": -220.30197143554688, + "eval_loss": 0.6897847652435303, + "eval_rewards/accuracies": 0.6420000195503235, + "eval_rewards/chosen": -0.001953852828592062, + "eval_rewards/margins": 0.08494684100151062, + "eval_rewards/rejected": -0.08690068125724792, + "eval_runtime": 712.847, + "eval_samples_per_second": 2.806, + "eval_steps_per_second": 1.403, + "step": 12000 + }, + { + "epoch": 0.79, + "learning_rate": 6.669819956953768e-07, + "logits/chosen": -2.1582138538360596, + "logits/rejected": -2.0927162170410156, + "logps/chosen": -170.88861083984375, + "logps/rejected": -186.5122833251953, + "loss": 0.6899, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.005573832895606756, + "rewards/margins": 0.07083684206008911, + "rewards/rejected": -0.07641066610813141, + "step": 12010 + }, + { + "epoch": 0.79, + "learning_rate": 6.631040107928957e-07, + "logits/chosen": -2.4988465309143066, + "logits/rejected": -2.1730918884277344, + "logps/chosen": -268.5477294921875, + "logps/rejected": -180.80491638183594, + "loss": 0.691, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.0036344178952276707, + "rewards/margins": 0.08555683493614197, + "rewards/rejected": -0.0891912430524826, + "step": 12020 + }, + { + "epoch": 0.79, + "learning_rate": 6.592356080397072e-07, + "logits/chosen": -2.393764019012451, + "logits/rejected": -1.8318722248077393, + "logps/chosen": -227.7593994140625, + "logps/rejected": -179.75608825683594, + "loss": 0.6901, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0052979374304413795, + "rewards/margins": 0.08460094779729843, + "rewards/rejected": -0.07930301129817963, + "step": 12030 + }, + { + "epoch": 0.79, + "learning_rate": 6.553768076152963e-07, + "logits/chosen": -2.2826318740844727, + "logits/rejected": -2.3857228755950928, + "logps/chosen": -157.96484375, + "logps/rejected": -197.0670928955078, + "loss": 0.6889, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.0013273532968014479, + "rewards/margins": 0.11998225748538971, + "rewards/rejected": -0.11865489184856415, + "step": 12040 + }, + { + "epoch": 0.79, + "learning_rate": 6.51527629649055e-07, + "logits/chosen": -2.419004201889038, + "logits/rejected": -2.2709298133850098, + "logps/chosen": -254.87680053710938, + "logps/rejected": -231.243408203125, + "loss": 0.6909, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.012937125749886036, + "rewards/margins": 0.05513680726289749, + "rewards/rejected": -0.06807393580675125, + "step": 12050 + }, + { + "epoch": 0.79, + "learning_rate": 6.476880942201824e-07, + "logits/chosen": -2.5580201148986816, + "logits/rejected": -2.1555469036102295, + "logps/chosen": -238.6651611328125, + "logps/rejected": -189.90174865722656, + "loss": 0.688, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.026725223287940025, + "rewards/margins": 0.09860799461603165, + "rewards/rejected": -0.07188276946544647, + "step": 12060 + }, + { + "epoch": 0.79, + "learning_rate": 6.438582213575748e-07, + "logits/chosen": -2.245471477508545, + "logits/rejected": -2.217454195022583, + "logps/chosen": -229.4067840576172, + "logps/rejected": -247.80722045898438, + "loss": 0.6912, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.005097637884318829, + "rewards/margins": 0.0725497156381607, + "rewards/rejected": -0.0674520879983902, + "step": 12070 + }, + { + "epoch": 0.79, + "learning_rate": 6.400380310397267e-07, + "logits/chosen": -2.1843056678771973, + "logits/rejected": -2.1890132427215576, + "logps/chosen": -232.0767822265625, + "logps/rejected": -268.4687805175781, + "loss": 0.6922, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.00033568107755854726, + "rewards/margins": 0.04980158433318138, + "rewards/rejected": -0.04946590214967728, + "step": 12080 + }, + { + "epoch": 0.79, + "learning_rate": 6.362275431946202e-07, + "logits/chosen": -2.168041467666626, + "logits/rejected": -2.201719284057617, + "logps/chosen": -235.95803833007812, + "logps/rejected": -248.0696563720703, + "loss": 0.6896, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.003955559339374304, + "rewards/margins": 0.06063423305749893, + "rewards/rejected": -0.05667867138981819, + "step": 12090 + }, + { + "epoch": 0.79, + "learning_rate": 6.324267776996285e-07, + "logits/chosen": -2.389530658721924, + "logits/rejected": -2.0050384998321533, + "logps/chosen": -375.6055908203125, + "logps/rejected": -265.6221618652344, + "loss": 0.6881, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.0102651696652174, + "rewards/margins": 0.11997060477733612, + "rewards/rejected": -0.13023580610752106, + "step": 12100 + }, + { + "epoch": 0.79, + "eval_logits/chosen": -2.3229787349700928, + "eval_logits/rejected": -2.134584426879883, + "eval_logps/chosen": -231.93675231933594, + "eval_logps/rejected": -219.7613525390625, + "eval_loss": 0.6897767782211304, + "eval_rewards/accuracies": 0.6384999752044678, + "eval_rewards/chosen": 0.0006819861009716988, + "eval_rewards/margins": 0.0821765884757042, + "eval_rewards/rejected": -0.08149460703134537, + "eval_runtime": 714.3235, + "eval_samples_per_second": 2.8, + "eval_steps_per_second": 1.4, + "step": 12100 + }, + { + "epoch": 0.79, + "learning_rate": 6.286357543814045e-07, + "logits/chosen": -2.2420029640197754, + "logits/rejected": -2.154069662094116, + "logps/chosen": -193.7761993408203, + "logps/rejected": -272.2079772949219, + "loss": 0.6872, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.002463629934936762, + "rewards/margins": 0.10420586168766022, + "rewards/rejected": -0.10666950047016144, + "step": 12110 + }, + { + "epoch": 0.79, + "learning_rate": 6.248544930157838e-07, + "logits/chosen": -2.3656675815582275, + "logits/rejected": -2.122084617614746, + "logps/chosen": -183.5366973876953, + "logps/rejected": -178.4772186279297, + "loss": 0.6866, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.005488743539899588, + "rewards/margins": 0.12721005082130432, + "rewards/rejected": -0.12172132730484009, + "step": 12120 + }, + { + "epoch": 0.79, + "learning_rate": 6.21083013327678e-07, + "logits/chosen": -2.3051021099090576, + "logits/rejected": -2.202974796295166, + "logps/chosen": -300.95269775390625, + "logps/rejected": -251.5149383544922, + "loss": 0.6905, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.032880254089832306, + "rewards/margins": 0.06488000601530075, + "rewards/rejected": -0.03199975565075874, + "step": 12130 + }, + { + "epoch": 0.79, + "learning_rate": 6.17321334990973e-07, + "logits/chosen": -2.2721495628356934, + "logits/rejected": -2.188018798828125, + "logps/chosen": -200.94345092773438, + "logps/rejected": -172.30050659179688, + "loss": 0.6913, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0013894874136894941, + "rewards/margins": 0.06594385951757431, + "rewards/rejected": -0.06733334064483643, + "step": 12140 + }, + { + "epoch": 0.79, + "learning_rate": 6.135694776284243e-07, + "logits/chosen": -2.431753635406494, + "logits/rejected": -2.160250425338745, + "logps/chosen": -269.80584716796875, + "logps/rejected": -216.8905487060547, + "loss": 0.6889, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.009851393289864063, + "rewards/margins": 0.11856885999441147, + "rewards/rejected": -0.10871747881174088, + "step": 12150 + }, + { + "epoch": 0.8, + "learning_rate": 6.098274608115595e-07, + "logits/chosen": -2.2147669792175293, + "logits/rejected": -2.0826354026794434, + "logps/chosen": -204.7724151611328, + "logps/rejected": -179.89138793945312, + "loss": 0.693, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.008782130666077137, + "rewards/margins": 0.037513960152864456, + "rewards/rejected": -0.028731834143400192, + "step": 12160 + }, + { + "epoch": 0.8, + "learning_rate": 6.060953040605697e-07, + "logits/chosen": -2.4421582221984863, + "logits/rejected": -1.913739562034607, + "logps/chosen": -334.14593505859375, + "logps/rejected": -275.56756591796875, + "loss": 0.6909, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.04983791708946228, + "rewards/margins": 0.11316549777984619, + "rewards/rejected": -0.0633275955915451, + "step": 12170 + }, + { + "epoch": 0.8, + "learning_rate": 6.023730268442144e-07, + "logits/chosen": -2.23698091506958, + "logits/rejected": -2.057175636291504, + "logps/chosen": -202.0666046142578, + "logps/rejected": -185.1787109375, + "loss": 0.6871, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.027542103081941605, + "rewards/margins": 0.11556919664144516, + "rewards/rejected": -0.08802708983421326, + "step": 12180 + }, + { + "epoch": 0.8, + "learning_rate": 5.986606485797131e-07, + "logits/chosen": -2.234809160232544, + "logits/rejected": -2.023869752883911, + "logps/chosen": -199.58416748046875, + "logps/rejected": -211.5013885498047, + "loss": 0.6893, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.003064130200073123, + "rewards/margins": 0.06851175427436829, + "rewards/rejected": -0.0654476210474968, + "step": 12190 + }, + { + "epoch": 0.8, + "learning_rate": 5.949581886326511e-07, + "logits/chosen": -2.352273941040039, + "logits/rejected": -2.33347749710083, + "logps/chosen": -295.5454406738281, + "logps/rejected": -271.19891357421875, + "loss": 0.6905, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.022466326132416725, + "rewards/margins": 0.0456775538623333, + "rewards/rejected": -0.023211227729916573, + "step": 12200 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -2.3217251300811768, + "eval_logits/rejected": -2.1334657669067383, + "eval_logps/chosen": -230.84371948242188, + "eval_logps/rejected": -218.59002685546875, + "eval_loss": 0.6897699236869812, + "eval_rewards/accuracies": 0.6340000033378601, + "eval_rewards/chosen": 0.011612382717430592, + "eval_rewards/margins": 0.08139365911483765, + "eval_rewards/rejected": -0.06978128105401993, + "eval_runtime": 713.062, + "eval_samples_per_second": 2.805, + "eval_steps_per_second": 1.402, + "step": 12200 + }, + { + "epoch": 0.8, + "learning_rate": 5.912656663168717e-07, + "logits/chosen": -2.4126431941986084, + "logits/rejected": -2.3553099632263184, + "logps/chosen": -221.67831420898438, + "logps/rejected": -221.48641967773438, + "loss": 0.6911, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.023309772834181786, + "rewards/margins": 0.06180506944656372, + "rewards/rejected": -0.03849529102444649, + "step": 12210 + }, + { + "epoch": 0.8, + "learning_rate": 5.875831008943817e-07, + "logits/chosen": -2.158846616744995, + "logits/rejected": -2.1420199871063232, + "logps/chosen": -177.19949340820312, + "logps/rejected": -160.8948516845703, + "loss": 0.6907, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0044457814656198025, + "rewards/margins": 0.06295563280582428, + "rewards/rejected": -0.058509863913059235, + "step": 12220 + }, + { + "epoch": 0.8, + "learning_rate": 5.839105115752442e-07, + "logits/chosen": -2.284562110900879, + "logits/rejected": -2.0982518196105957, + "logps/chosen": -224.8004150390625, + "logps/rejected": -193.5330352783203, + "loss": 0.6883, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.01822907105088234, + "rewards/margins": 0.08740357309579849, + "rewards/rejected": -0.10563264042139053, + "step": 12230 + }, + { + "epoch": 0.8, + "learning_rate": 5.802479175174855e-07, + "logits/chosen": -2.2706997394561768, + "logits/rejected": -2.1211254596710205, + "logps/chosen": -163.5981903076172, + "logps/rejected": -178.07614135742188, + "loss": 0.6896, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.023960810154676437, + "rewards/margins": 0.08550871908664703, + "rewards/rejected": -0.061547912657260895, + "step": 12240 + }, + { + "epoch": 0.8, + "learning_rate": 5.765953378269901e-07, + "logits/chosen": -2.189150333404541, + "logits/rejected": -2.127336025238037, + "logps/chosen": -206.731689453125, + "logps/rejected": -248.9289093017578, + "loss": 0.6869, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0026171256322413683, + "rewards/margins": 0.11153455078601837, + "rewards/rejected": -0.10891741514205933, + "step": 12250 + }, + { + "epoch": 0.8, + "learning_rate": 5.729527915574037e-07, + "logits/chosen": -2.343411922454834, + "logits/rejected": -2.2044577598571777, + "logps/chosen": -220.0668487548828, + "logps/rejected": -229.0086669921875, + "loss": 0.6906, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.0035170826595276594, + "rewards/margins": 0.08602721989154816, + "rewards/rejected": -0.08251012861728668, + "step": 12260 + }, + { + "epoch": 0.8, + "learning_rate": 5.693202977100304e-07, + "logits/chosen": -2.354865074157715, + "logits/rejected": -2.067964553833008, + "logps/chosen": -168.99929809570312, + "logps/rejected": -172.4113311767578, + "loss": 0.6896, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.011458302848041058, + "rewards/margins": 0.06748346984386444, + "rewards/rejected": -0.05602516978979111, + "step": 12270 + }, + { + "epoch": 0.8, + "learning_rate": 5.656978752337389e-07, + "logits/chosen": -2.3720412254333496, + "logits/rejected": -2.168274402618408, + "logps/chosen": -200.18475341796875, + "logps/rejected": -212.4142303466797, + "loss": 0.6878, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.00529628898948431, + "rewards/margins": 0.10829710960388184, + "rewards/rejected": -0.11359341442584991, + "step": 12280 + }, + { + "epoch": 0.8, + "learning_rate": 5.620855430248581e-07, + "logits/chosen": -2.269085645675659, + "logits/rejected": -2.128979206085205, + "logps/chosen": -160.27426147460938, + "logps/rejected": -168.23971557617188, + "loss": 0.6898, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.017207933589816093, + "rewards/margins": 0.10143836587667465, + "rewards/rejected": -0.0842304453253746, + "step": 12290 + }, + { + "epoch": 0.8, + "learning_rate": 5.584833199270837e-07, + "logits/chosen": -2.33870267868042, + "logits/rejected": -2.1915290355682373, + "logps/chosen": -228.42819213867188, + "logps/rejected": -227.838623046875, + "loss": 0.6915, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0020407275296747684, + "rewards/margins": 0.07103622704744339, + "rewards/rejected": -0.06899549812078476, + "step": 12300 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -2.3225796222686768, + "eval_logits/rejected": -2.1341781616210938, + "eval_logps/chosen": -231.32383728027344, + "eval_logps/rejected": -219.537353515625, + "eval_loss": 0.6897599101066589, + "eval_rewards/accuracies": 0.6365000009536743, + "eval_rewards/chosen": 0.006811096332967281, + "eval_rewards/margins": 0.08606572449207306, + "eval_rewards/rejected": -0.0792546421289444, + "eval_runtime": 712.8422, + "eval_samples_per_second": 2.806, + "eval_steps_per_second": 1.403, + "step": 12300 + }, + { + "epoch": 0.81, + "learning_rate": 5.548912247313742e-07, + "logits/chosen": -2.5504541397094727, + "logits/rejected": -2.120837450027466, + "logps/chosen": -284.99554443359375, + "logps/rejected": -243.7223663330078, + "loss": 0.6913, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.011955673806369305, + "rewards/margins": 0.06302481144666672, + "rewards/rejected": -0.0749804899096489, + "step": 12310 + }, + { + "epoch": 0.81, + "learning_rate": 5.513092761758596e-07, + "logits/chosen": -2.367363452911377, + "logits/rejected": -2.1712703704833984, + "logps/chosen": -271.68951416015625, + "logps/rejected": -208.8588104248047, + "loss": 0.6922, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0047961072996258736, + "rewards/margins": 0.0415019765496254, + "rewards/rejected": -0.0462980791926384, + "step": 12320 + }, + { + "epoch": 0.81, + "learning_rate": 5.477374929457363e-07, + "logits/chosen": -2.2922816276550293, + "logits/rejected": -2.291826009750366, + "logps/chosen": -201.52459716796875, + "logps/rejected": -188.91586303710938, + "loss": 0.6916, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.005365630611777306, + "rewards/margins": 0.06563162058591843, + "rewards/rejected": -0.06026599556207657, + "step": 12330 + }, + { + "epoch": 0.81, + "learning_rate": 5.441758936731772e-07, + "logits/chosen": -2.3044772148132324, + "logits/rejected": -2.1925055980682373, + "logps/chosen": -234.58203125, + "logps/rejected": -225.813720703125, + "loss": 0.6899, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.007632553577423096, + "rewards/margins": 0.09021967649459839, + "rewards/rejected": -0.08258712291717529, + "step": 12340 + }, + { + "epoch": 0.81, + "learning_rate": 5.406244969372273e-07, + "logits/chosen": -2.225092887878418, + "logits/rejected": -2.0609354972839355, + "logps/chosen": -199.0269012451172, + "logps/rejected": -219.9811248779297, + "loss": 0.6867, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.006959347520023584, + "rewards/margins": 0.14452257752418518, + "rewards/rejected": -0.13756322860717773, + "step": 12350 + }, + { + "epoch": 0.81, + "learning_rate": 5.370833212637122e-07, + "logits/chosen": -2.2868409156799316, + "logits/rejected": -1.9941694736480713, + "logps/chosen": -218.37545776367188, + "logps/rejected": -213.766357421875, + "loss": 0.6913, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.0013551099691540003, + "rewards/margins": 0.09176047146320343, + "rewards/rejected": -0.09040535986423492, + "step": 12360 + }, + { + "epoch": 0.81, + "learning_rate": 5.335523851251392e-07, + "logits/chosen": -2.239475727081299, + "logits/rejected": -2.166835069656372, + "logps/chosen": -206.76382446289062, + "logps/rejected": -195.5325469970703, + "loss": 0.6882, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0062995306216180325, + "rewards/margins": 0.09566928446292877, + "rewards/rejected": -0.10196882486343384, + "step": 12370 + }, + { + "epoch": 0.81, + "learning_rate": 5.300317069406003e-07, + "logits/chosen": -2.219008445739746, + "logits/rejected": -2.191943407058716, + "logps/chosen": -158.67398071289062, + "logps/rejected": -173.11231994628906, + "loss": 0.687, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.020719021558761597, + "rewards/margins": 0.10671563446521759, + "rewards/rejected": -0.085996612906456, + "step": 12380 + }, + { + "epoch": 0.81, + "learning_rate": 5.265213050756782e-07, + "logits/chosen": -2.44820499420166, + "logits/rejected": -2.2743122577667236, + "logps/chosen": -229.58837890625, + "logps/rejected": -236.1202850341797, + "loss": 0.6896, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.024628793820738792, + "rewards/margins": 0.08663028478622437, + "rewards/rejected": -0.06200150400400162, + "step": 12390 + }, + { + "epoch": 0.81, + "learning_rate": 5.230211978423477e-07, + "logits/chosen": -2.3545405864715576, + "logits/rejected": -2.235017776489258, + "logps/chosen": -220.39273071289062, + "logps/rejected": -212.700439453125, + "loss": 0.6927, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.01592477597296238, + "rewards/margins": 0.07476507127285004, + "rewards/rejected": -0.09068983793258667, + "step": 12400 + }, + { + "epoch": 0.81, + "eval_logits/chosen": -2.3246352672576904, + "eval_logits/rejected": -2.1361496448516846, + "eval_logps/chosen": -230.83546447753906, + "eval_logps/rejected": -218.64422607421875, + "eval_loss": 0.6897544264793396, + "eval_rewards/accuracies": 0.6349999904632568, + "eval_rewards/chosen": 0.0116947703063488, + "eval_rewards/margins": 0.08201787620782852, + "eval_rewards/rejected": -0.07032310217618942, + "eval_runtime": 715.5021, + "eval_samples_per_second": 2.795, + "eval_steps_per_second": 1.398, + "step": 12400 + }, + { + "epoch": 0.81, + "learning_rate": 5.195314034988835e-07, + "logits/chosen": -2.4760658740997314, + "logits/rejected": -2.2121901512145996, + "logps/chosen": -217.1764678955078, + "logps/rejected": -159.36187744140625, + "loss": 0.6891, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.021106228232383728, + "rewards/margins": 0.09224637597799301, + "rewards/rejected": -0.07114015519618988, + "step": 12410 + }, + { + "epoch": 0.81, + "learning_rate": 5.160519402497616e-07, + "logits/chosen": -2.3596584796905518, + "logits/rejected": -2.209519147872925, + "logps/chosen": -230.1166229248047, + "logps/rejected": -234.6483612060547, + "loss": 0.6881, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0006260558729991317, + "rewards/margins": 0.0859164297580719, + "rewards/rejected": -0.08654247969388962, + "step": 12420 + }, + { + "epoch": 0.81, + "learning_rate": 5.125828262455679e-07, + "logits/chosen": -2.256119728088379, + "logits/rejected": -2.056142807006836, + "logps/chosen": -254.59011840820312, + "logps/rejected": -233.541748046875, + "loss": 0.6887, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.011229803785681725, + "rewards/margins": 0.09291192889213562, + "rewards/rejected": -0.08168213069438934, + "step": 12430 + }, + { + "epoch": 0.81, + "learning_rate": 5.091240795828992e-07, + "logits/chosen": -2.019127368927002, + "logits/rejected": -2.2054896354675293, + "logps/chosen": -200.9918212890625, + "logps/rejected": -231.53662109375, + "loss": 0.6911, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.014986000955104828, + "rewards/margins": 0.10714125633239746, + "rewards/rejected": -0.09215524047613144, + "step": 12440 + }, + { + "epoch": 0.81, + "learning_rate": 5.056757183042732e-07, + "logits/chosen": -2.24973464012146, + "logits/rejected": -2.1625046730041504, + "logps/chosen": -232.5579071044922, + "logps/rejected": -222.85122680664062, + "loss": 0.6896, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.009982489980757236, + "rewards/margins": 0.10170789062976837, + "rewards/rejected": -0.09172537922859192, + "step": 12450 + }, + { + "epoch": 0.82, + "learning_rate": 5.022377603980308e-07, + "logits/chosen": -2.4154021739959717, + "logits/rejected": -2.081878185272217, + "logps/chosen": -249.6466522216797, + "logps/rejected": -192.86483764648438, + "loss": 0.6883, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.0036199470050632954, + "rewards/margins": 0.09689504653215408, + "rewards/rejected": -0.10051499307155609, + "step": 12460 + }, + { + "epoch": 0.82, + "learning_rate": 4.988102237982454e-07, + "logits/chosen": -2.36234974861145, + "logits/rejected": -2.286999225616455, + "logps/chosen": -223.90420532226562, + "logps/rejected": -188.7877655029297, + "loss": 0.6927, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.017237504944205284, + "rewards/margins": 0.05458803102374077, + "rewards/rejected": -0.0718255490064621, + "step": 12470 + }, + { + "epoch": 0.82, + "learning_rate": 4.953931263846251e-07, + "logits/chosen": -2.3473331928253174, + "logits/rejected": -2.0705606937408447, + "logps/chosen": -261.57763671875, + "logps/rejected": -227.3242645263672, + "loss": 0.6896, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0013126353733241558, + "rewards/margins": 0.0970732644200325, + "rewards/rejected": -0.09576062858104706, + "step": 12480 + }, + { + "epoch": 0.82, + "learning_rate": 4.919864859824266e-07, + "logits/chosen": -2.291419267654419, + "logits/rejected": -2.145946979522705, + "logps/chosen": -238.6917724609375, + "logps/rejected": -202.22686767578125, + "loss": 0.691, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.01257591973990202, + "rewards/margins": 0.08570267260074615, + "rewards/rejected": -0.09827860444784164, + "step": 12490 + }, + { + "epoch": 0.82, + "learning_rate": 4.885903203623532e-07, + "logits/chosen": -2.481529712677002, + "logits/rejected": -2.0849225521087646, + "logps/chosen": -283.2362365722656, + "logps/rejected": -227.9224090576172, + "loss": 0.6897, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.017776403576135635, + "rewards/margins": 0.0845954492688179, + "rewards/rejected": -0.06681904196739197, + "step": 12500 + }, + { + "epoch": 0.82, + "eval_logits/chosen": -2.325686454772949, + "eval_logits/rejected": -2.137054681777954, + "eval_logps/chosen": -231.05908203125, + "eval_logps/rejected": -218.7409210205078, + "eval_loss": 0.6897637844085693, + "eval_rewards/accuracies": 0.6324999928474426, + "eval_rewards/chosen": 0.009458942338824272, + "eval_rewards/margins": 0.08074919879436493, + "eval_rewards/rejected": -0.07129025459289551, + "eval_runtime": 712.8253, + "eval_samples_per_second": 2.806, + "eval_steps_per_second": 1.403, + "step": 12500 + }, + { + "epoch": 0.82, + "learning_rate": 4.852046472404695e-07, + "logits/chosen": -2.4758987426757812, + "logits/rejected": -1.738454818725586, + "logps/chosen": -294.7528381347656, + "logps/rejected": -177.1123504638672, + "loss": 0.6903, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.031358979642391205, + "rewards/margins": 0.08032914996147156, + "rewards/rejected": -0.04897017404437065, + "step": 12510 + }, + { + "epoch": 0.82, + "learning_rate": 4.818294842781035e-07, + "logits/chosen": -2.3906936645507812, + "logits/rejected": -2.208167552947998, + "logps/chosen": -224.6951141357422, + "logps/rejected": -184.04940795898438, + "loss": 0.6891, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.0151332076638937, + "rewards/margins": 0.11712169647216797, + "rewards/rejected": -0.10198847949504852, + "step": 12520 + }, + { + "epoch": 0.82, + "learning_rate": 4.784648490817601e-07, + "logits/chosen": -2.388882875442505, + "logits/rejected": -2.094998836517334, + "logps/chosen": -218.9868621826172, + "logps/rejected": -182.89913940429688, + "loss": 0.6904, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.011199096217751503, + "rewards/margins": 0.07170800119638443, + "rewards/rejected": -0.060508906841278076, + "step": 12530 + }, + { + "epoch": 0.82, + "learning_rate": 4.751107592030235e-07, + "logits/chosen": -2.399803400039673, + "logits/rejected": -2.1163930892944336, + "logps/chosen": -167.94920349121094, + "logps/rejected": -162.90350341796875, + "loss": 0.6897, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.01902575045824051, + "rewards/margins": 0.12054232507944107, + "rewards/rejected": -0.10151656717061996, + "step": 12540 + }, + { + "epoch": 0.82, + "learning_rate": 4.717672321384703e-07, + "logits/chosen": -2.294178009033203, + "logits/rejected": -2.0486741065979004, + "logps/chosen": -220.0991973876953, + "logps/rejected": -187.33387756347656, + "loss": 0.6883, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.03146491199731827, + "rewards/margins": 0.094304159283638, + "rewards/rejected": -0.06283925473690033, + "step": 12550 + }, + { + "epoch": 0.82, + "learning_rate": 4.684342853295748e-07, + "logits/chosen": -2.250814199447632, + "logits/rejected": -2.1270248889923096, + "logps/chosen": -185.5677032470703, + "logps/rejected": -191.38328552246094, + "loss": 0.6884, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.02129070833325386, + "rewards/margins": 0.09739609807729721, + "rewards/rejected": -0.07610537856817245, + "step": 12560 + }, + { + "epoch": 0.82, + "learning_rate": 4.651119361626213e-07, + "logits/chosen": -2.5402443408966064, + "logits/rejected": -2.2059483528137207, + "logps/chosen": -238.58627319335938, + "logps/rejected": -199.43728637695312, + "loss": 0.6903, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.025323236361145973, + "rewards/margins": 0.07079877704381943, + "rewards/rejected": -0.045475538820028305, + "step": 12570 + }, + { + "epoch": 0.82, + "learning_rate": 4.618002019686091e-07, + "logits/chosen": -2.3017804622650146, + "logits/rejected": -2.1221537590026855, + "logps/chosen": -278.8421630859375, + "logps/rejected": -230.077880859375, + "loss": 0.6894, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.0037017776630818844, + "rewards/margins": 0.07871778309345245, + "rewards/rejected": -0.07501600682735443, + "step": 12580 + }, + { + "epoch": 0.82, + "learning_rate": 4.5849910002316757e-07, + "logits/chosen": -2.3771467208862305, + "logits/rejected": -2.034799575805664, + "logps/chosen": -182.04556274414062, + "logps/rejected": -158.250732421875, + "loss": 0.6908, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.015334153547883034, + "rewards/margins": 0.09469417482614517, + "rewards/rejected": -0.11002832651138306, + "step": 12590 + }, + { + "epoch": 0.82, + "learning_rate": 4.5520864754645984e-07, + "logits/chosen": -2.421297550201416, + "logits/rejected": -2.2816054821014404, + "logps/chosen": -279.0403747558594, + "logps/rejected": -242.0817108154297, + "loss": 0.6905, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.023093996569514275, + "rewards/margins": 0.06866296380758286, + "rewards/rejected": -0.04556896537542343, + "step": 12600 + }, + { + "epoch": 0.82, + "eval_logits/chosen": -2.3262624740600586, + "eval_logits/rejected": -2.1376304626464844, + "eval_logps/chosen": -231.39772033691406, + "eval_logps/rejected": -219.05184936523438, + "eval_loss": 0.6897605061531067, + "eval_rewards/accuracies": 0.6365000009536743, + "eval_rewards/chosen": 0.006072386633604765, + "eval_rewards/margins": 0.08047185838222504, + "eval_rewards/rejected": -0.07439946383237839, + "eval_runtime": 714.0825, + "eval_samples_per_second": 2.801, + "eval_steps_per_second": 1.4, + "step": 12600 + }, + { + "epoch": 0.83, + "learning_rate": 4.5192886170309896e-07, + "logits/chosen": -2.250743865966797, + "logits/rejected": -2.1508920192718506, + "logps/chosen": -190.7171630859375, + "logps/rejected": -195.17050170898438, + "loss": 0.692, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.0005129704950377345, + "rewards/margins": 0.042692478746175766, + "rewards/rejected": -0.04217951372265816, + "step": 12610 + }, + { + "epoch": 0.83, + "learning_rate": 4.486597596020548e-07, + "logits/chosen": -2.3476834297180176, + "logits/rejected": -2.0744900703430176, + "logps/chosen": -219.5320281982422, + "logps/rejected": -190.0487518310547, + "loss": 0.6892, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.0004929341375827789, + "rewards/margins": 0.08838485181331635, + "rewards/rejected": -0.08789192140102386, + "step": 12620 + }, + { + "epoch": 0.83, + "learning_rate": 4.454013582965644e-07, + "logits/chosen": -2.2743661403656006, + "logits/rejected": -1.8610731363296509, + "logps/chosen": -263.5701904296875, + "logps/rejected": -219.12197875976562, + "loss": 0.6914, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.001134876860305667, + "rewards/margins": 0.059951018542051315, + "rewards/rejected": -0.06108590215444565, + "step": 12630 + }, + { + "epoch": 0.83, + "learning_rate": 4.4215367478404605e-07, + "logits/chosen": -2.125401020050049, + "logits/rejected": -2.1011133193969727, + "logps/chosen": -272.4917297363281, + "logps/rejected": -303.33233642578125, + "loss": 0.691, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.01731831021606922, + "rewards/margins": 0.06468813121318817, + "rewards/rejected": -0.08200643211603165, + "step": 12640 + }, + { + "epoch": 0.83, + "learning_rate": 4.389167260060068e-07, + "logits/chosen": -2.4071826934814453, + "logits/rejected": -2.16564679145813, + "logps/chosen": -205.21432495117188, + "logps/rejected": -178.94972229003906, + "loss": 0.6879, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03475916385650635, + "rewards/margins": 0.11909898370504379, + "rewards/rejected": -0.08433983474969864, + "step": 12650 + }, + { + "epoch": 0.83, + "learning_rate": 4.356905288479579e-07, + "logits/chosen": -2.2683780193328857, + "logits/rejected": -2.039124011993408, + "logps/chosen": -227.39346313476562, + "logps/rejected": -212.4646453857422, + "loss": 0.6859, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.008726147934794426, + "rewards/margins": 0.13835129141807556, + "rewards/rejected": -0.14707742631435394, + "step": 12660 + }, + { + "epoch": 0.83, + "learning_rate": 4.3247510013932377e-07, + "logits/chosen": -2.217339515686035, + "logits/rejected": -2.0521254539489746, + "logps/chosen": -252.25942993164062, + "logps/rejected": -260.9900817871094, + "loss": 0.6901, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.0007096766494214535, + "rewards/margins": 0.08910763263702393, + "rewards/rejected": -0.08981730788946152, + "step": 12670 + }, + { + "epoch": 0.83, + "learning_rate": 4.2927045665335594e-07, + "logits/chosen": -1.9187930822372437, + "logits/rejected": -1.8410171270370483, + "logps/chosen": -167.76393127441406, + "logps/rejected": -172.5765838623047, + "loss": 0.6884, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.0288618765771389, + "rewards/margins": 0.08367923647165298, + "rewards/rejected": -0.11254110187292099, + "step": 12680 + }, + { + "epoch": 0.83, + "learning_rate": 4.260766151070439e-07, + "logits/chosen": -2.1377556324005127, + "logits/rejected": -2.1839041709899902, + "logps/chosen": -224.4342803955078, + "logps/rejected": -222.44125366210938, + "loss": 0.6902, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -7.813423508196138e-06, + "rewards/margins": 0.09267593920230865, + "rewards/rejected": -0.09268374741077423, + "step": 12690 + }, + { + "epoch": 0.83, + "learning_rate": 4.228935921610308e-07, + "logits/chosen": -2.3698036670684814, + "logits/rejected": -2.0076329708099365, + "logps/chosen": -262.02777099609375, + "logps/rejected": -204.87368774414062, + "loss": 0.6905, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.016908859834074974, + "rewards/margins": 0.06897404789924622, + "rewards/rejected": -0.052065182477235794, + "step": 12700 + }, + { + "epoch": 0.83, + "eval_logits/chosen": -2.3263168334960938, + "eval_logits/rejected": -2.1376240253448486, + "eval_logps/chosen": -231.38568115234375, + "eval_logps/rejected": -219.1470947265625, + "eval_loss": 0.689755916595459, + "eval_rewards/accuracies": 0.6334999799728394, + "eval_rewards/chosen": 0.0061928038485348225, + "eval_rewards/margins": 0.08154484629631042, + "eval_rewards/rejected": -0.07535204291343689, + "eval_runtime": 712.3303, + "eval_samples_per_second": 2.808, + "eval_steps_per_second": 1.404, + "step": 12700 + }, + { + "epoch": 0.83, + "learning_rate": 4.1972140441952246e-07, + "logits/chosen": -2.178255081176758, + "logits/rejected": -2.146824359893799, + "logps/chosen": -236.79483032226562, + "logps/rejected": -246.6728973388672, + "loss": 0.6903, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.027884885668754578, + "rewards/margins": 0.07868941873311996, + "rewards/rejected": -0.05080454424023628, + "step": 12710 + }, + { + "epoch": 0.83, + "learning_rate": 4.165600684302046e-07, + "logits/chosen": -2.2848753929138184, + "logits/rejected": -2.33852481842041, + "logps/chosen": -173.56161499023438, + "logps/rejected": -192.32496643066406, + "loss": 0.6897, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.022223882377147675, + "rewards/margins": 0.08805432170629501, + "rewards/rejected": -0.06583045423030853, + "step": 12720 + }, + { + "epoch": 0.83, + "learning_rate": 4.13409600684154e-07, + "logits/chosen": -2.392894744873047, + "logits/rejected": -2.1074211597442627, + "logps/chosen": -215.2464599609375, + "logps/rejected": -195.3648681640625, + "loss": 0.6883, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.0022683297283947468, + "rewards/margins": 0.09301736205816269, + "rewards/rejected": -0.09528569132089615, + "step": 12730 + }, + { + "epoch": 0.83, + "learning_rate": 4.102700176157548e-07, + "logits/chosen": -2.457080364227295, + "logits/rejected": -2.087562084197998, + "logps/chosen": -324.733642578125, + "logps/rejected": -234.39254760742188, + "loss": 0.6904, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.00021561775065492839, + "rewards/margins": 0.07445430010557175, + "rewards/rejected": -0.07423867285251617, + "step": 12740 + }, + { + "epoch": 0.83, + "learning_rate": 4.0714133560260884e-07, + "logits/chosen": -2.3434674739837646, + "logits/rejected": -2.1800377368927, + "logps/chosen": -259.04107666015625, + "logps/rejected": -207.67910766601562, + "loss": 0.6911, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.004920002073049545, + "rewards/margins": 0.06868889182806015, + "rewards/rejected": -0.0637688934803009, + "step": 12750 + }, + { + "epoch": 0.83, + "learning_rate": 4.0402357096545527e-07, + "logits/chosen": -2.2215254306793213, + "logits/rejected": -2.1798095703125, + "logps/chosen": -250.39193725585938, + "logps/rejected": -248.3852081298828, + "loss": 0.6899, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.013696703128516674, + "rewards/margins": 0.0816536545753479, + "rewards/rejected": -0.06795695424079895, + "step": 12760 + }, + { + "epoch": 0.84, + "learning_rate": 4.0091673996808025e-07, + "logits/chosen": -2.4354679584503174, + "logits/rejected": -2.2336437702178955, + "logps/chosen": -198.1351776123047, + "logps/rejected": -181.49725341796875, + "loss": 0.689, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.010751022025942802, + "rewards/margins": 0.07989239692687988, + "rewards/rejected": -0.09064342081546783, + "step": 12770 + }, + { + "epoch": 0.84, + "learning_rate": 3.9782085881723776e-07, + "logits/chosen": -2.2519314289093018, + "logits/rejected": -2.1054606437683105, + "logps/chosen": -164.67416381835938, + "logps/rejected": -188.26077270507812, + "loss": 0.6886, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.009234304539859295, + "rewards/margins": 0.11619944870471954, + "rewards/rejected": -0.10696514695882797, + "step": 12780 + }, + { + "epoch": 0.84, + "learning_rate": 3.947359436625592e-07, + "logits/chosen": -2.2633697986602783, + "logits/rejected": -2.157527446746826, + "logps/chosen": -232.0320281982422, + "logps/rejected": -205.00436401367188, + "loss": 0.6891, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.02630910649895668, + "rewards/margins": 0.1053650975227356, + "rewards/rejected": -0.07905599474906921, + "step": 12790 + }, + { + "epoch": 0.84, + "learning_rate": 3.9166201059647386e-07, + "logits/chosen": -2.399799346923828, + "logits/rejected": -2.2643110752105713, + "logps/chosen": -259.34234619140625, + "logps/rejected": -227.6080780029297, + "loss": 0.6907, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.02507871761918068, + "rewards/margins": 0.051967114210128784, + "rewards/rejected": -0.026888396590948105, + "step": 12800 + }, + { + "epoch": 0.84, + "eval_logits/chosen": -2.3278772830963135, + "eval_logits/rejected": -2.138990640640259, + "eval_logps/chosen": -230.71701049804688, + "eval_logps/rejected": -218.49427795410156, + "eval_loss": 0.6897500157356262, + "eval_rewards/accuracies": 0.6359999775886536, + "eval_rewards/chosen": 0.012879305519163609, + "eval_rewards/margins": 0.08170315623283386, + "eval_rewards/rejected": -0.06882384419441223, + "eval_runtime": 711.356, + "eval_samples_per_second": 2.812, + "eval_steps_per_second": 1.406, + "step": 12800 + }, + { + "epoch": 0.84, + "learning_rate": 3.8859907565412194e-07, + "logits/chosen": -2.208301544189453, + "logits/rejected": -2.289896249771118, + "logps/chosen": -179.2988739013672, + "logps/rejected": -186.98802185058594, + "loss": 0.6891, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0035035633482038975, + "rewards/margins": 0.08267536014318466, + "rewards/rejected": -0.0791717916727066, + "step": 12810 + }, + { + "epoch": 0.84, + "learning_rate": 3.8554715481327303e-07, + "logits/chosen": -2.365440845489502, + "logits/rejected": -1.9772167205810547, + "logps/chosen": -233.1007843017578, + "logps/rejected": -207.98318481445312, + "loss": 0.6874, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.0015824921429157257, + "rewards/margins": 0.10280168056488037, + "rewards/rejected": -0.10121919214725494, + "step": 12820 + }, + { + "epoch": 0.84, + "learning_rate": 3.8250626399424007e-07, + "logits/chosen": -2.3790652751922607, + "logits/rejected": -2.1290512084960938, + "logps/chosen": -249.7417449951172, + "logps/rejected": -239.1602020263672, + "loss": 0.6902, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.017364097759127617, + "rewards/margins": 0.08268658816814423, + "rewards/rejected": -0.06532249599695206, + "step": 12830 + }, + { + "epoch": 0.84, + "learning_rate": 3.7947641905980104e-07, + "logits/chosen": -2.203456163406372, + "logits/rejected": -2.2128920555114746, + "logps/chosen": -205.8812713623047, + "logps/rejected": -179.02224731445312, + "loss": 0.6892, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.014371681027114391, + "rewards/margins": 0.07821665704250336, + "rewards/rejected": -0.0638449639081955, + "step": 12840 + }, + { + "epoch": 0.84, + "learning_rate": 3.764576358151098e-07, + "logits/chosen": -2.238459825515747, + "logits/rejected": -2.233910322189331, + "logps/chosen": -173.391845703125, + "logps/rejected": -167.51803588867188, + "loss": 0.6909, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.01425178349018097, + "rewards/margins": 0.059704847633838654, + "rewards/rejected": -0.045453060418367386, + "step": 12850 + }, + { + "epoch": 0.84, + "learning_rate": 3.7344993000761944e-07, + "logits/chosen": -2.349444627761841, + "logits/rejected": -2.242591381072998, + "logps/chosen": -178.75843811035156, + "logps/rejected": -230.89614868164062, + "loss": 0.6892, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.014432880096137524, + "rewards/margins": 0.0916595309972763, + "rewards/rejected": -0.10609239339828491, + "step": 12860 + }, + { + "epoch": 0.84, + "learning_rate": 3.7045331732699585e-07, + "logits/chosen": -2.3606972694396973, + "logits/rejected": -2.171160936355591, + "logps/chosen": -203.33251953125, + "logps/rejected": -180.00003051757812, + "loss": 0.6859, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.01865537092089653, + "rewards/margins": 0.12366944551467896, + "rewards/rejected": -0.10501406341791153, + "step": 12870 + }, + { + "epoch": 0.84, + "learning_rate": 3.6746781340503993e-07, + "logits/chosen": -2.2035956382751465, + "logits/rejected": -2.104219436645508, + "logps/chosen": -231.5043487548828, + "logps/rejected": -232.2509307861328, + "loss": 0.6872, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.029114311560988426, + "rewards/margins": 0.0877356305718422, + "rewards/rejected": -0.058621324598789215, + "step": 12880 + }, + { + "epoch": 0.84, + "learning_rate": 3.6449343381560116e-07, + "logits/chosen": -2.2976372241973877, + "logits/rejected": -2.071730613708496, + "logps/chosen": -232.3043212890625, + "logps/rejected": -229.9260711669922, + "loss": 0.6893, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.008786660619080067, + "rewards/margins": 0.11011286079883575, + "rewards/rejected": -0.11889950931072235, + "step": 12890 + }, + { + "epoch": 0.84, + "learning_rate": 3.615301940745017e-07, + "logits/chosen": -2.5623362064361572, + "logits/rejected": -1.9647992849349976, + "logps/chosen": -314.986083984375, + "logps/rejected": -217.2484130859375, + "loss": 0.6911, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.022090371698141098, + "rewards/margins": 0.07439263164997101, + "rewards/rejected": -0.05230225995182991, + "step": 12900 + }, + { + "epoch": 0.84, + "eval_logits/chosen": -2.3258814811706543, + "eval_logits/rejected": -2.1371657848358154, + "eval_logps/chosen": -230.18865966796875, + "eval_logps/rejected": -218.1457061767578, + "eval_loss": 0.6897482872009277, + "eval_rewards/accuracies": 0.6334999799728394, + "eval_rewards/chosen": 0.0181629229336977, + "eval_rewards/margins": 0.0835009291768074, + "eval_rewards/rejected": -0.06533800810575485, + "eval_runtime": 710.8607, + "eval_samples_per_second": 2.813, + "eval_steps_per_second": 1.407, + "step": 12900 + }, + { + "epoch": 0.84, + "learning_rate": 3.5857810963945084e-07, + "logits/chosen": -2.1857872009277344, + "logits/rejected": -1.9668527841567993, + "logps/chosen": -214.0235595703125, + "logps/rejected": -207.2059783935547, + "loss": 0.6891, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.004855555482208729, + "rewards/margins": 0.08005331456661224, + "rewards/rejected": -0.07519775629043579, + "step": 12910 + }, + { + "epoch": 0.85, + "learning_rate": 3.556371959099678e-07, + "logits/chosen": -2.3944733142852783, + "logits/rejected": -2.1525301933288574, + "logps/chosen": -294.125732421875, + "logps/rejected": -261.60772705078125, + "loss": 0.6911, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.030485982075333595, + "rewards/margins": 0.07452499121427536, + "rewards/rejected": -0.04403900355100632, + "step": 12920 + }, + { + "epoch": 0.85, + "learning_rate": 3.5270746822729797e-07, + "logits/chosen": -2.280972957611084, + "logits/rejected": -2.2035224437713623, + "logps/chosen": -246.06930541992188, + "logps/rejected": -267.5752258300781, + "loss": 0.6892, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.014127634465694427, + "rewards/margins": 0.08930746465921402, + "rewards/rejected": -0.07517983019351959, + "step": 12930 + }, + { + "epoch": 0.85, + "learning_rate": 3.4978894187433746e-07, + "logits/chosen": -2.3760852813720703, + "logits/rejected": -2.23614501953125, + "logps/chosen": -152.7420196533203, + "logps/rejected": -146.85598754882812, + "loss": 0.6902, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.016667893156409264, + "rewards/margins": 0.05022420361638069, + "rewards/rejected": -0.0668920949101448, + "step": 12940 + }, + { + "epoch": 0.85, + "learning_rate": 3.468816320755486e-07, + "logits/chosen": -2.163353443145752, + "logits/rejected": -1.9819806814193726, + "logps/chosen": -220.0610809326172, + "logps/rejected": -186.81573486328125, + "loss": 0.6903, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03866659849882126, + "rewards/margins": 0.06364385038614273, + "rewards/rejected": -0.02497725561261177, + "step": 12950 + }, + { + "epoch": 0.85, + "learning_rate": 3.4398555399688336e-07, + "logits/chosen": -2.425262212753296, + "logits/rejected": -2.0546271800994873, + "logps/chosen": -211.6660919189453, + "logps/rejected": -196.01766967773438, + "loss": 0.6918, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0031560491770505905, + "rewards/margins": 0.05015747994184494, + "rewards/rejected": -0.05331353470683098, + "step": 12960 + }, + { + "epoch": 0.85, + "learning_rate": 3.411007227457047e-07, + "logits/chosen": -2.327322006225586, + "logits/rejected": -2.270181179046631, + "logps/chosen": -244.98068237304688, + "logps/rejected": -223.50830078125, + "loss": 0.6875, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.02780618704855442, + "rewards/margins": 0.10802390426397324, + "rewards/rejected": -0.08021771907806396, + "step": 12970 + }, + { + "epoch": 0.85, + "learning_rate": 3.382271533707043e-07, + "logits/chosen": -2.24385404586792, + "logits/rejected": -2.227750301361084, + "logps/chosen": -189.8992919921875, + "logps/rejected": -173.1783447265625, + "loss": 0.6904, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.01705920323729515, + "rewards/margins": 0.0616273507475853, + "rewards/rejected": -0.044568147510290146, + "step": 12980 + }, + { + "epoch": 0.85, + "learning_rate": 3.353648608618287e-07, + "logits/chosen": -2.3153414726257324, + "logits/rejected": -2.06382417678833, + "logps/chosen": -167.10702514648438, + "logps/rejected": -164.1302490234375, + "loss": 0.6895, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.00764242559671402, + "rewards/margins": 0.0699392706155777, + "rewards/rejected": -0.06229684501886368, + "step": 12990 + }, + { + "epoch": 0.85, + "learning_rate": 3.3251386015019676e-07, + "logits/chosen": -2.3504998683929443, + "logits/rejected": -2.136061191558838, + "logps/chosen": -201.58786010742188, + "logps/rejected": -176.71900939941406, + "loss": 0.6886, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.011715460568666458, + "rewards/margins": 0.0887567475438118, + "rewards/rejected": -0.07704129070043564, + "step": 13000 + }, + { + "epoch": 0.85, + "eval_logits/chosen": -2.3278446197509766, + "eval_logits/rejected": -2.138990640640259, + "eval_logps/chosen": -230.5150146484375, + "eval_logps/rejected": -218.6830596923828, + "eval_loss": 0.6897379159927368, + "eval_rewards/accuracies": 0.6365000009536743, + "eval_rewards/chosen": 0.014899209141731262, + "eval_rewards/margins": 0.0856110006570816, + "eval_rewards/rejected": -0.07071178406476974, + "eval_runtime": 711.2988, + "eval_samples_per_second": 2.812, + "eval_steps_per_second": 1.406, + "step": 13000 + }, + { + "epoch": 0.85, + "learning_rate": 3.296741661080255e-07, + "logits/chosen": -2.2785840034484863, + "logits/rejected": -2.186216115951538, + "logps/chosen": -232.2931671142578, + "logps/rejected": -235.34976196289062, + "loss": 0.688, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.008452139794826508, + "rewards/margins": 0.09597768634557724, + "rewards/rejected": -0.08752553910017014, + "step": 13010 + }, + { + "epoch": 0.85, + "learning_rate": 3.2684579354854974e-07, + "logits/chosen": -2.3801522254943848, + "logits/rejected": -2.2735419273376465, + "logps/chosen": -294.13189697265625, + "logps/rejected": -312.67303466796875, + "loss": 0.6917, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.016732942312955856, + "rewards/margins": 0.08282653987407684, + "rewards/rejected": -0.099559485912323, + "step": 13020 + }, + { + "epoch": 0.85, + "learning_rate": 3.2402875722594653e-07, + "logits/chosen": -2.408092737197876, + "logits/rejected": -2.130885362625122, + "logps/chosen": -165.7288360595703, + "logps/rejected": -181.1701202392578, + "loss": 0.6893, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.03316589817404747, + "rewards/margins": 0.09289722144603729, + "rewards/rejected": -0.05973132699728012, + "step": 13030 + }, + { + "epoch": 0.85, + "learning_rate": 3.212230718352566e-07, + "logits/chosen": -2.2618842124938965, + "logits/rejected": -2.2778449058532715, + "logps/chosen": -224.6171875, + "logps/rejected": -162.24813842773438, + "loss": 0.6932, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.004326606169342995, + "rewards/margins": 0.013279316015541553, + "rewards/rejected": -0.008952709846198559, + "step": 13040 + }, + { + "epoch": 0.85, + "learning_rate": 3.1842875201231025e-07, + "logits/chosen": -2.3244717121124268, + "logits/rejected": -2.0360231399536133, + "logps/chosen": -220.89639282226562, + "logps/rejected": -202.62872314453125, + "loss": 0.6897, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.008436797186732292, + "rewards/margins": 0.07507555186748505, + "rewards/rejected": -0.0666387528181076, + "step": 13050 + }, + { + "epoch": 0.85, + "learning_rate": 3.156458123336478e-07, + "logits/chosen": -2.1647417545318604, + "logits/rejected": -1.9881470203399658, + "logps/chosen": -156.07313537597656, + "logps/rejected": -158.6283721923828, + "loss": 0.6889, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.024387424811720848, + "rewards/margins": 0.12704019248485565, + "rewards/rejected": -0.10265277326107025, + "step": 13060 + }, + { + "epoch": 0.86, + "learning_rate": 3.128742673164459e-07, + "logits/chosen": -2.402404308319092, + "logits/rejected": -2.034379720687866, + "logps/chosen": -280.46630859375, + "logps/rejected": -245.130615234375, + "loss": 0.6902, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.024609588086605072, + "rewards/margins": 0.09180278331041336, + "rewards/rejected": -0.06719318777322769, + "step": 13070 + }, + { + "epoch": 0.86, + "learning_rate": 3.101141314184414e-07, + "logits/chosen": -2.511915683746338, + "logits/rejected": -2.2664966583251953, + "logps/chosen": -203.702880859375, + "logps/rejected": -200.3907928466797, + "loss": 0.6924, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.016816768795251846, + "rewards/margins": 0.058991938829422, + "rewards/rejected": -0.04217516630887985, + "step": 13080 + }, + { + "epoch": 0.86, + "learning_rate": 3.0736541903785526e-07, + "logits/chosen": -2.164177417755127, + "logits/rejected": -2.129770517349243, + "logps/chosen": -207.080322265625, + "logps/rejected": -264.8565368652344, + "loss": 0.6908, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.019345903769135475, + "rewards/margins": 0.08316637575626373, + "rewards/rejected": -0.06382046639919281, + "step": 13090 + }, + { + "epoch": 0.86, + "learning_rate": 3.0462814451331704e-07, + "logits/chosen": -2.218174934387207, + "logits/rejected": -2.0847418308258057, + "logps/chosen": -229.975341796875, + "logps/rejected": -235.6430206298828, + "loss": 0.6914, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.007010665722191334, + "rewards/margins": 0.05762631446123123, + "rewards/rejected": -0.05061563849449158, + "step": 13100 + }, + { + "epoch": 0.86, + "eval_logits/chosen": -2.326002836227417, + "eval_logits/rejected": -2.1373050212860107, + "eval_logps/chosen": -230.6532745361328, + "eval_logps/rejected": -218.62350463867188, + "eval_loss": 0.6897422075271606, + "eval_rewards/accuracies": 0.6355000138282776, + "eval_rewards/chosen": 0.013516743667423725, + "eval_rewards/margins": 0.0836327001452446, + "eval_rewards/rejected": -0.07011596858501434, + "eval_runtime": 712.647, + "eval_samples_per_second": 2.806, + "eval_steps_per_second": 1.403, + "step": 13100 + }, + { + "epoch": 0.86, + "learning_rate": 3.019023221237927e-07, + "logits/chosen": -2.2801291942596436, + "logits/rejected": -2.1055219173431396, + "logps/chosen": -236.72677612304688, + "logps/rejected": -187.6962890625, + "loss": 0.6898, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 5.53287572984118e-05, + "rewards/margins": 0.08536889404058456, + "rewards/rejected": -0.08531356602907181, + "step": 13110 + }, + { + "epoch": 0.86, + "learning_rate": 2.991879660885058e-07, + "logits/chosen": -2.4416868686676025, + "logits/rejected": -2.1827034950256348, + "logps/chosen": -261.5027770996094, + "logps/rejected": -253.5607147216797, + "loss": 0.6911, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.020441105589270592, + "rewards/margins": 0.08418162912130356, + "rewards/rejected": -0.06374052911996841, + "step": 13120 + }, + { + "epoch": 0.86, + "learning_rate": 2.9648509056686786e-07, + "logits/chosen": -2.3484883308410645, + "logits/rejected": -2.1797919273376465, + "logps/chosen": -175.03335571289062, + "logps/rejected": -158.2578887939453, + "loss": 0.6874, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.01908428594470024, + "rewards/margins": 0.08800263702869415, + "rewards/rejected": -0.06891834735870361, + "step": 13130 + }, + { + "epoch": 0.86, + "learning_rate": 2.937937096584012e-07, + "logits/chosen": -2.3359837532043457, + "logits/rejected": -2.114428758621216, + "logps/chosen": -287.9132385253906, + "logps/rejected": -234.6566619873047, + "loss": 0.6892, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.024876803159713745, + "rewards/margins": 0.07227373123168945, + "rewards/rejected": -0.04739692062139511, + "step": 13140 + }, + { + "epoch": 0.86, + "learning_rate": 2.9111383740266756e-07, + "logits/chosen": -2.134230136871338, + "logits/rejected": -2.000624179840088, + "logps/chosen": -234.1415557861328, + "logps/rejected": -235.3389434814453, + "loss": 0.6911, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.016426388174295425, + "rewards/margins": 0.06985460221767426, + "rewards/rejected": -0.05342821404337883, + "step": 13150 + }, + { + "epoch": 0.86, + "learning_rate": 2.8844548777919255e-07, + "logits/chosen": -2.374894618988037, + "logits/rejected": -2.0936150550842285, + "logps/chosen": -201.53781127929688, + "logps/rejected": -187.21641540527344, + "loss": 0.6889, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.02346022054553032, + "rewards/margins": 0.0795883983373642, + "rewards/rejected": -0.056128181517124176, + "step": 13160 + }, + { + "epoch": 0.86, + "learning_rate": 2.8578867470739594e-07, + "logits/chosen": -2.2102534770965576, + "logits/rejected": -2.069348096847534, + "logps/chosen": -184.6317138671875, + "logps/rejected": -164.10693359375, + "loss": 0.6876, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.001433709287084639, + "rewards/margins": 0.1089792400598526, + "rewards/rejected": -0.11041294038295746, + "step": 13170 + }, + { + "epoch": 0.86, + "learning_rate": 2.8314341204651484e-07, + "logits/chosen": -2.4396963119506836, + "logits/rejected": -2.1671500205993652, + "logps/chosen": -273.39495849609375, + "logps/rejected": -211.6111602783203, + "loss": 0.6875, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.026441499590873718, + "rewards/margins": 0.1099875345826149, + "rewards/rejected": -0.08354604244232178, + "step": 13180 + }, + { + "epoch": 0.86, + "learning_rate": 2.805097135955362e-07, + "logits/chosen": -2.3553264141082764, + "logits/rejected": -2.145498037338257, + "logps/chosen": -210.2996368408203, + "logps/rejected": -186.16250610351562, + "loss": 0.6882, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.022243741899728775, + "rewards/margins": 0.10788760334253311, + "rewards/rejected": -0.08564385771751404, + "step": 13190 + }, + { + "epoch": 0.86, + "learning_rate": 2.778875930931213e-07, + "logits/chosen": -2.3537840843200684, + "logits/rejected": -2.0290145874023438, + "logps/chosen": -233.2071075439453, + "logps/rejected": -225.6391143798828, + "loss": 0.6887, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.019117634743452072, + "rewards/margins": 0.09794165939092636, + "rewards/rejected": -0.07882402837276459, + "step": 13200 + }, + { + "epoch": 0.86, + "eval_logits/chosen": -2.3253469467163086, + "eval_logits/rejected": -2.136681318283081, + "eval_logps/chosen": -230.8813018798828, + "eval_logps/rejected": -218.95068359375, + "eval_loss": 0.6897428035736084, + "eval_rewards/accuracies": 0.6370000243186951, + "eval_rewards/chosen": 0.011236421763896942, + "eval_rewards/margins": 0.08462419360876083, + "eval_rewards/rejected": -0.0733877643942833, + "eval_runtime": 711.1811, + "eval_samples_per_second": 2.812, + "eval_steps_per_second": 1.406, + "step": 13200 + }, + { + "epoch": 0.86, + "learning_rate": 2.7527706421753426e-07, + "logits/chosen": -2.320481777191162, + "logits/rejected": -2.259533643722534, + "logps/chosen": -198.22946166992188, + "logps/rejected": -209.1739044189453, + "loss": 0.6908, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.0017170917708426714, + "rewards/margins": 0.06455695629119873, + "rewards/rejected": -0.06283987313508987, + "step": 13210 + }, + { + "epoch": 0.86, + "learning_rate": 2.726781405865736e-07, + "logits/chosen": -2.4160947799682617, + "logits/rejected": -1.8638538122177124, + "logps/chosen": -302.37939453125, + "logps/rejected": -195.12802124023438, + "loss": 0.6901, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.0028636218048632145, + "rewards/margins": 0.08517131209373474, + "rewards/rejected": -0.08230768889188766, + "step": 13220 + }, + { + "epoch": 0.87, + "learning_rate": 2.7009083575749687e-07, + "logits/chosen": -2.3083367347717285, + "logits/rejected": -2.20975661277771, + "logps/chosen": -243.7759246826172, + "logps/rejected": -248.42660522460938, + "loss": 0.6908, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.0008438148652203381, + "rewards/margins": 0.06286215782165527, + "rewards/rejected": -0.062018342316150665, + "step": 13230 + }, + { + "epoch": 0.87, + "learning_rate": 2.6751516322695457e-07, + "logits/chosen": -2.380704402923584, + "logits/rejected": -2.3123157024383545, + "logps/chosen": -188.8930206298828, + "logps/rejected": -188.26637268066406, + "loss": 0.6903, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.013505371287465096, + "rewards/margins": 0.054132528603076935, + "rewards/rejected": -0.04062715917825699, + "step": 13240 + }, + { + "epoch": 0.87, + "learning_rate": 2.649511364309154e-07, + "logits/chosen": -2.315520763397217, + "logits/rejected": -2.2880232334136963, + "logps/chosen": -201.6102294921875, + "logps/rejected": -189.54742431640625, + "loss": 0.6901, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.006794482469558716, + "rewards/margins": 0.08818355947732925, + "rewards/rejected": -0.08138908445835114, + "step": 13250 + }, + { + "epoch": 0.87, + "learning_rate": 2.6239876874460003e-07, + "logits/chosen": -2.4259033203125, + "logits/rejected": -2.2980501651763916, + "logps/chosen": -282.17254638671875, + "logps/rejected": -264.3243713378906, + "loss": 0.6878, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.022426238283514977, + "rewards/margins": 0.12707999348640442, + "rewards/rejected": -0.10465376079082489, + "step": 13260 + }, + { + "epoch": 0.87, + "learning_rate": 2.5985807348240744e-07, + "logits/chosen": -2.4714465141296387, + "logits/rejected": -2.0047621726989746, + "logps/chosen": -228.8441925048828, + "logps/rejected": -192.6537322998047, + "loss": 0.6876, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.034993596374988556, + "rewards/margins": 0.11756626516580582, + "rewards/rejected": -0.08257267624139786, + "step": 13270 + }, + { + "epoch": 0.87, + "learning_rate": 2.5732906389785014e-07, + "logits/chosen": -2.3537182807922363, + "logits/rejected": -2.1851718425750732, + "logps/chosen": -274.45111083984375, + "logps/rejected": -248.9810333251953, + "loss": 0.6862, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.0325496569275856, + "rewards/margins": 0.12675470113754272, + "rewards/rejected": -0.09420505911111832, + "step": 13280 + }, + { + "epoch": 0.87, + "learning_rate": 2.5481175318347956e-07, + "logits/chosen": -2.2177436351776123, + "logits/rejected": -2.26359224319458, + "logps/chosen": -225.0997314453125, + "logps/rejected": -252.0758819580078, + "loss": 0.6895, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.023978976532816887, + "rewards/margins": 0.09080135077238083, + "rewards/rejected": -0.06682237982749939, + "step": 13290 + }, + { + "epoch": 0.87, + "learning_rate": 2.5230615447082246e-07, + "logits/chosen": -2.3211140632629395, + "logits/rejected": -1.975968360900879, + "logps/chosen": -250.9895477294922, + "logps/rejected": -244.07345581054688, + "loss": 0.6891, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.003271388355642557, + "rewards/margins": 0.07869114726781845, + "rewards/rejected": -0.07541977614164352, + "step": 13300 + }, + { + "epoch": 0.87, + "eval_logits/chosen": -2.324570894241333, + "eval_logits/rejected": -2.1359615325927734, + "eval_logps/chosen": -230.75726318359375, + "eval_logps/rejected": -218.9420623779297, + "eval_loss": 0.6897307634353638, + "eval_rewards/accuracies": 0.640500009059906, + "eval_rewards/chosen": 0.012476898729801178, + "eval_rewards/margins": 0.08577845990657806, + "eval_rewards/rejected": -0.07330156117677689, + "eval_runtime": 715.3491, + "eval_samples_per_second": 2.796, + "eval_steps_per_second": 1.398, + "step": 13300 + }, + { + "epoch": 0.87, + "learning_rate": 2.49812280830308e-07, + "logits/chosen": -2.3523142337799072, + "logits/rejected": -1.8849273920059204, + "logps/chosen": -222.9706268310547, + "logps/rejected": -205.4813995361328, + "loss": 0.6848, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.02066592127084732, + "rewards/margins": 0.17366810142993927, + "rewards/rejected": -0.15300217270851135, + "step": 13310 + }, + { + "epoch": 0.87, + "learning_rate": 2.4733014527120457e-07, + "logits/chosen": -2.2234625816345215, + "logits/rejected": -2.0490236282348633, + "logps/chosen": -206.6344451904297, + "logps/rejected": -194.33306884765625, + "loss": 0.6906, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.04219576343894005, + "rewards/margins": 0.10591878741979599, + "rewards/rejected": -0.14811456203460693, + "step": 13320 + }, + { + "epoch": 0.87, + "learning_rate": 2.4485976074154565e-07, + "logits/chosen": -2.285674571990967, + "logits/rejected": -2.3419785499572754, + "logps/chosen": -208.6743927001953, + "logps/rejected": -240.1510009765625, + "loss": 0.6917, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 0.003236269112676382, + "rewards/margins": 0.01433448027819395, + "rewards/rejected": -0.011098211631178856, + "step": 13330 + }, + { + "epoch": 0.87, + "learning_rate": 2.4240114012806763e-07, + "logits/chosen": -2.310711622238159, + "logits/rejected": -2.287083148956299, + "logps/chosen": -205.61257934570312, + "logps/rejected": -194.80186462402344, + "loss": 0.6912, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.02185620740056038, + "rewards/margins": 0.06259147822856903, + "rewards/rejected": -0.04073526710271835, + "step": 13340 + }, + { + "epoch": 0.87, + "learning_rate": 2.399542962561399e-07, + "logits/chosen": -2.22048282623291, + "logits/rejected": -2.0561389923095703, + "logps/chosen": -222.4910125732422, + "logps/rejected": -185.62081909179688, + "loss": 0.6854, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.029649171978235245, + "rewards/margins": 0.10539106279611588, + "rewards/rejected": -0.07574189454317093, + "step": 13350 + }, + { + "epoch": 0.87, + "learning_rate": 2.3751924188969876e-07, + "logits/chosen": -2.2612175941467285, + "logits/rejected": -2.1327712535858154, + "logps/chosen": -249.13388061523438, + "logps/rejected": -239.14697265625, + "loss": 0.6903, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.026046359911561012, + "rewards/margins": 0.09847725927829742, + "rewards/rejected": -0.07243090122938156, + "step": 13360 + }, + { + "epoch": 0.87, + "learning_rate": 2.3509598973118024e-07, + "logits/chosen": -2.448444366455078, + "logits/rejected": -2.249556064605713, + "logps/chosen": -219.6066436767578, + "logps/rejected": -162.51773071289062, + "loss": 0.6918, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.013987274840474129, + "rewards/margins": 0.05698896199464798, + "rewards/rejected": -0.043001689016819, + "step": 13370 + }, + { + "epoch": 0.88, + "learning_rate": 2.326845524214555e-07, + "logits/chosen": -2.1156511306762695, + "logits/rejected": -2.193941354751587, + "logps/chosen": -234.92654418945312, + "logps/rejected": -209.80361938476562, + "loss": 0.6924, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.001277850242331624, + "rewards/margins": 0.010018276050686836, + "rewards/rejected": -0.008740425109863281, + "step": 13380 + }, + { + "epoch": 0.88, + "learning_rate": 2.3028494253976158e-07, + "logits/chosen": -2.3883180618286133, + "logits/rejected": -2.1776084899902344, + "logps/chosen": -340.9706115722656, + "logps/rejected": -282.01092529296875, + "loss": 0.6907, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.0029482836835086346, + "rewards/margins": 0.056216467171907425, + "rewards/rejected": -0.0532681830227375, + "step": 13390 + }, + { + "epoch": 0.88, + "learning_rate": 2.2789717260364026e-07, + "logits/chosen": -2.376600980758667, + "logits/rejected": -2.19740891456604, + "logps/chosen": -164.39688110351562, + "logps/rejected": -148.5932159423828, + "loss": 0.6913, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.007092096842825413, + "rewards/margins": 0.059240736067295074, + "rewards/rejected": -0.05214863270521164, + "step": 13400 + }, + { + "epoch": 0.88, + "eval_logits/chosen": -2.3266842365264893, + "eval_logits/rejected": -2.1379003524780273, + "eval_logps/chosen": -230.48577880859375, + "eval_logps/rejected": -218.5886993408203, + "eval_loss": 0.6897343993186951, + "eval_rewards/accuracies": 0.6305000185966492, + "eval_rewards/chosen": 0.015191725455224514, + "eval_rewards/margins": 0.08495970070362091, + "eval_rewards/rejected": -0.06976797431707382, + "eval_runtime": 711.3263, + "eval_samples_per_second": 2.812, + "eval_steps_per_second": 1.406, + "step": 13400 + }, + { + "epoch": 0.88, + "learning_rate": 2.255212550688682e-07, + "logits/chosen": -2.2746529579162598, + "logits/rejected": -2.3803787231445312, + "logps/chosen": -216.0994110107422, + "logps/rejected": -289.5954895019531, + "loss": 0.6881, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.018555883318185806, + "rewards/margins": 0.09309352934360504, + "rewards/rejected": -0.07453764975070953, + "step": 13410 + }, + { + "epoch": 0.88, + "learning_rate": 2.2315720232939598e-07, + "logits/chosen": -2.6417040824890137, + "logits/rejected": -2.17865252494812, + "logps/chosen": -258.8695373535156, + "logps/rejected": -180.49835205078125, + "loss": 0.6896, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.03244171291589737, + "rewards/margins": 0.1123114600777626, + "rewards/rejected": -0.07986976206302643, + "step": 13420 + }, + { + "epoch": 0.88, + "learning_rate": 2.2080502671727956e-07, + "logits/chosen": -2.428260087966919, + "logits/rejected": -2.08381724357605, + "logps/chosen": -218.2057342529297, + "logps/rejected": -204.05075073242188, + "loss": 0.6889, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.018760915845632553, + "rewards/margins": 0.07672649621963501, + "rewards/rejected": -0.05796556919813156, + "step": 13430 + }, + { + "epoch": 0.88, + "learning_rate": 2.1846474050262078e-07, + "logits/chosen": -2.3987889289855957, + "logits/rejected": -2.263538360595703, + "logps/chosen": -243.8972930908203, + "logps/rejected": -184.60328674316406, + "loss": 0.6902, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.026634354144334793, + "rewards/margins": 0.06503110378980637, + "rewards/rejected": -0.03839675337076187, + "step": 13440 + }, + { + "epoch": 0.88, + "learning_rate": 2.1613635589349756e-07, + "logits/chosen": -1.9668877124786377, + "logits/rejected": -2.058589458465576, + "logps/chosen": -194.289306640625, + "logps/rejected": -232.1760711669922, + "loss": 0.6901, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.028739606961607933, + "rewards/margins": 0.10366035997867584, + "rewards/rejected": -0.07492075115442276, + "step": 13450 + }, + { + "epoch": 0.88, + "learning_rate": 2.1381988503590578e-07, + "logits/chosen": -2.0607008934020996, + "logits/rejected": -2.126495599746704, + "logps/chosen": -213.87832641601562, + "logps/rejected": -218.08837890625, + "loss": 0.6896, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.019935574382543564, + "rewards/margins": 0.10359902679920197, + "rewards/rejected": -0.0836634561419487, + "step": 13460 + }, + { + "epoch": 0.88, + "learning_rate": 2.11515340013691e-07, + "logits/chosen": -2.41133975982666, + "logits/rejected": -2.3844552040100098, + "logps/chosen": -228.10009765625, + "logps/rejected": -219.16781616210938, + "loss": 0.6883, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.030724655836820602, + "rewards/margins": 0.12647958099842072, + "rewards/rejected": -0.09575492143630981, + "step": 13470 + }, + { + "epoch": 0.88, + "learning_rate": 2.092227328484897e-07, + "logits/chosen": -2.187948226928711, + "logits/rejected": -2.1468586921691895, + "logps/chosen": -202.74105834960938, + "logps/rejected": -243.099609375, + "loss": 0.6876, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.012676420621573925, + "rewards/margins": 0.09701034426689148, + "rewards/rejected": -0.08433392643928528, + "step": 13480 + }, + { + "epoch": 0.88, + "learning_rate": 2.0694207549966345e-07, + "logits/chosen": -2.2124342918395996, + "logits/rejected": -2.0843167304992676, + "logps/chosen": -208.91720581054688, + "logps/rejected": -197.19125366210938, + "loss": 0.6912, + "rewards/accuracies": 0.625, + "rewards/chosen": -8.745789818931371e-05, + "rewards/margins": 0.051435112953186035, + "rewards/rejected": -0.05152256414294243, + "step": 13490 + }, + { + "epoch": 0.88, + "learning_rate": 2.0467337986423864e-07, + "logits/chosen": -2.462228298187256, + "logits/rejected": -2.1654422283172607, + "logps/chosen": -302.31915283203125, + "logps/rejected": -275.478759765625, + "loss": 0.6912, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.0340435728430748, + "rewards/margins": 0.06372375041246414, + "rewards/rejected": -0.029680173844099045, + "step": 13500 + }, + { + "epoch": 0.88, + "eval_logits/chosen": -2.326474189758301, + "eval_logits/rejected": -2.1377792358398438, + "eval_logps/chosen": -230.06185913085938, + "eval_logps/rejected": -218.02516174316406, + "eval_loss": 0.6897424459457397, + "eval_rewards/accuracies": 0.6359999775886536, + "eval_rewards/chosen": 0.019430968910455704, + "eval_rewards/margins": 0.0835636630654335, + "eval_rewards/rejected": -0.0641326829791069, + "eval_runtime": 712.2388, + "eval_samples_per_second": 2.808, + "eval_steps_per_second": 1.404, + "step": 13500 + }, + { + "epoch": 0.88, + "learning_rate": 2.0241665777684272e-07, + "logits/chosen": -2.3777573108673096, + "logits/rejected": -2.2866101264953613, + "logps/chosen": -265.6940002441406, + "logps/rejected": -234.2218017578125, + "loss": 0.6878, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.025989696383476257, + "rewards/margins": 0.12143020331859589, + "rewards/rejected": -0.09544049948453903, + "step": 13510 + }, + { + "epoch": 0.88, + "learning_rate": 2.0017192100964366e-07, + "logits/chosen": -2.0364346504211426, + "logits/rejected": -2.1206259727478027, + "logps/chosen": -199.53173828125, + "logps/rejected": -210.08749389648438, + "loss": 0.6913, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.004242539405822754, + "rewards/margins": 0.08438628911972046, + "rewards/rejected": -0.08014374226331711, + "step": 13520 + }, + { + "epoch": 0.89, + "learning_rate": 1.9793918127228777e-07, + "logits/chosen": -2.411618232727051, + "logits/rejected": -2.04459547996521, + "logps/chosen": -314.3971862792969, + "logps/rejected": -269.76873779296875, + "loss": 0.688, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.019022373482584953, + "rewards/margins": 0.09453563392162323, + "rewards/rejected": -0.07551325857639313, + "step": 13530 + }, + { + "epoch": 0.89, + "learning_rate": 1.9571845021184005e-07, + "logits/chosen": -2.174851894378662, + "logits/rejected": -2.0823864936828613, + "logps/chosen": -232.1296844482422, + "logps/rejected": -243.97640991210938, + "loss": 0.6891, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.011437867768108845, + "rewards/margins": 0.07728725671768188, + "rewards/rejected": -0.08872512727975845, + "step": 13540 + }, + { + "epoch": 0.89, + "learning_rate": 1.9350973941272027e-07, + "logits/chosen": -2.308931350708008, + "logits/rejected": -2.2825989723205566, + "logps/chosen": -203.7218017578125, + "logps/rejected": -191.3800506591797, + "loss": 0.6893, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.0005322685465216637, + "rewards/margins": 0.08159051835536957, + "rewards/rejected": -0.08105824887752533, + "step": 13550 + }, + { + "epoch": 0.89, + "learning_rate": 1.9131306039664676e-07, + "logits/chosen": -2.1821532249450684, + "logits/rejected": -2.1290283203125, + "logps/chosen": -198.3598175048828, + "logps/rejected": -232.4375457763672, + "loss": 0.6884, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.015442472882568836, + "rewards/margins": 0.09721332788467407, + "rewards/rejected": -0.08177085965871811, + "step": 13560 + }, + { + "epoch": 0.89, + "learning_rate": 1.8912842462257358e-07, + "logits/chosen": -2.2131478786468506, + "logits/rejected": -2.144645929336548, + "logps/chosen": -217.9169464111328, + "logps/rejected": -212.69229125976562, + "loss": 0.6869, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.022523250430822372, + "rewards/margins": 0.11762279272079468, + "rewards/rejected": -0.0950995534658432, + "step": 13570 + }, + { + "epoch": 0.89, + "learning_rate": 1.869558434866303e-07, + "logits/chosen": -2.2858431339263916, + "logits/rejected": -2.352550745010376, + "logps/chosen": -180.52920532226562, + "logps/rejected": -207.8330078125, + "loss": 0.6878, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0056061758659780025, + "rewards/margins": 0.0927395448088646, + "rewards/rejected": -0.08713337033987045, + "step": 13580 + }, + { + "epoch": 0.89, + "learning_rate": 1.847953283220652e-07, + "logits/chosen": -2.4596104621887207, + "logits/rejected": -2.134106397628784, + "logps/chosen": -252.9336700439453, + "logps/rejected": -188.153076171875, + "loss": 0.6859, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.03176042437553406, + "rewards/margins": 0.1385561227798462, + "rewards/rejected": -0.10679570585489273, + "step": 13590 + }, + { + "epoch": 0.89, + "learning_rate": 1.8264689039918265e-07, + "logits/chosen": -2.424584150314331, + "logits/rejected": -2.084427833557129, + "logps/chosen": -258.3966369628906, + "logps/rejected": -238.4198760986328, + "loss": 0.6905, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.014337467029690742, + "rewards/margins": 0.07449439913034439, + "rewards/rejected": -0.0601569339632988, + "step": 13600 + }, + { + "epoch": 0.89, + "eval_logits/chosen": -2.3269259929656982, + "eval_logits/rejected": -2.138150930404663, + "eval_logps/chosen": -230.37106323242188, + "eval_logps/rejected": -218.5099639892578, + "eval_loss": 0.6897370219230652, + "eval_rewards/accuracies": 0.6380000114440918, + "eval_rewards/chosen": 0.016339082270860672, + "eval_rewards/margins": 0.08531977236270905, + "eval_rewards/rejected": -0.06898068636655807, + "eval_runtime": 711.7818, + "eval_samples_per_second": 2.81, + "eval_steps_per_second": 1.405, + "step": 13600 + }, + { + "epoch": 0.89, + "learning_rate": 1.8051054092528857e-07, + "logits/chosen": -2.351792573928833, + "logits/rejected": -2.163576602935791, + "logps/chosen": -257.79010009765625, + "logps/rejected": -259.30328369140625, + "loss": 0.6886, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.03311315178871155, + "rewards/margins": 0.11273415386676788, + "rewards/rejected": -0.07962099462747574, + "step": 13610 + }, + { + "epoch": 0.89, + "learning_rate": 1.783862910446271e-07, + "logits/chosen": -1.985805869102478, + "logits/rejected": -2.1247270107269287, + "logps/chosen": -171.29833984375, + "logps/rejected": -174.77696228027344, + "loss": 0.6884, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.01554956752806902, + "rewards/margins": 0.12793493270874023, + "rewards/rejected": -0.1123853549361229, + "step": 13620 + }, + { + "epoch": 0.89, + "learning_rate": 1.762741518383271e-07, + "logits/chosen": -2.367798328399658, + "logits/rejected": -2.2053184509277344, + "logps/chosen": -220.3152618408203, + "logps/rejected": -201.3122100830078, + "loss": 0.6882, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.015171055682003498, + "rewards/margins": 0.08661060780286789, + "rewards/rejected": -0.07143954932689667, + "step": 13630 + }, + { + "epoch": 0.89, + "learning_rate": 1.7417413432434082e-07, + "logits/chosen": -2.435668468475342, + "logits/rejected": -2.041511058807373, + "logps/chosen": -252.56173706054688, + "logps/rejected": -206.9319305419922, + "loss": 0.691, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0009791527409106493, + "rewards/margins": 0.07137512415647507, + "rewards/rejected": -0.07039596140384674, + "step": 13640 + }, + { + "epoch": 0.89, + "learning_rate": 1.7208624945738855e-07, + "logits/chosen": -2.435779571533203, + "logits/rejected": -2.284393310546875, + "logps/chosen": -216.6245574951172, + "logps/rejected": -228.45632934570312, + "loss": 0.6927, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.014597500674426556, + "rewards/margins": 0.04394357651472092, + "rewards/rejected": -0.02934606932103634, + "step": 13650 + }, + { + "epoch": 0.89, + "learning_rate": 1.7001050812889995e-07, + "logits/chosen": -2.412449598312378, + "logits/rejected": -2.0942466259002686, + "logps/chosen": -256.7345886230469, + "logps/rejected": -224.16006469726562, + "loss": 0.6905, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.01201794296503067, + "rewards/margins": 0.0767635852098465, + "rewards/rejected": -0.08878152817487717, + "step": 13660 + }, + { + "epoch": 0.89, + "learning_rate": 1.679469211669596e-07, + "logits/chosen": -2.3339667320251465, + "logits/rejected": -2.177741527557373, + "logps/chosen": -224.02359008789062, + "logps/rejected": -183.2321319580078, + "loss": 0.6878, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.009215526282787323, + "rewards/margins": 0.10812918096780777, + "rewards/rejected": -0.09891365468502045, + "step": 13670 + }, + { + "epoch": 0.9, + "learning_rate": 1.6589549933624715e-07, + "logits/chosen": -2.3178319931030273, + "logits/rejected": -2.134033679962158, + "logps/chosen": -236.70999145507812, + "logps/rejected": -199.88436889648438, + "loss": 0.6859, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.03497043251991272, + "rewards/margins": 0.14659801125526428, + "rewards/rejected": -0.11162757873535156, + "step": 13680 + }, + { + "epoch": 0.9, + "learning_rate": 1.638562533379845e-07, + "logits/chosen": -2.3221664428710938, + "logits/rejected": -2.150352954864502, + "logps/chosen": -253.377197265625, + "logps/rejected": -197.33758544921875, + "loss": 0.6902, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.02325398102402687, + "rewards/margins": 0.062283407896757126, + "rewards/rejected": -0.039029426872730255, + "step": 13690 + }, + { + "epoch": 0.9, + "learning_rate": 1.6182919380987676e-07, + "logits/chosen": -2.365807056427002, + "logits/rejected": -2.283456325531006, + "logps/chosen": -224.37783813476562, + "logps/rejected": -214.51156616210938, + "loss": 0.6913, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.013401429168879986, + "rewards/margins": 0.055108923465013504, + "rewards/rejected": -0.041707489639520645, + "step": 13700 + }, + { + "epoch": 0.9, + "eval_logits/chosen": -2.3265719413757324, + "eval_logits/rejected": -2.1378707885742188, + "eval_logps/chosen": -230.2803497314453, + "eval_logps/rejected": -218.3449249267578, + "eval_loss": 0.6897291541099548, + "eval_rewards/accuracies": 0.6359999775886536, + "eval_rewards/chosen": 0.017245886847376823, + "eval_rewards/margins": 0.08457593619823456, + "eval_rewards/rejected": -0.06733004748821259, + "eval_runtime": 711.018, + "eval_samples_per_second": 2.813, + "eval_steps_per_second": 1.406, + "step": 13700 + }, + { + "epoch": 0.9, + "learning_rate": 1.598143313260603e-07, + "logits/chosen": -2.2812981605529785, + "logits/rejected": -2.1852867603302, + "logps/chosen": -186.65716552734375, + "logps/rejected": -181.11973571777344, + "loss": 0.6905, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.013965973630547523, + "rewards/margins": 0.0647711455821991, + "rewards/rejected": -0.05080517381429672, + "step": 13710 + }, + { + "epoch": 0.9, + "learning_rate": 1.5781167639704415e-07, + "logits/chosen": -2.5151546001434326, + "logits/rejected": -1.9689128398895264, + "logps/chosen": -329.40081787109375, + "logps/rejected": -211.47140502929688, + "loss": 0.6913, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.027660047635436058, + "rewards/margins": 0.0631406158208847, + "rewards/rejected": -0.0354805663228035, + "step": 13720 + }, + { + "epoch": 0.9, + "learning_rate": 1.5582123946965787e-07, + "logits/chosen": -2.1882617473602295, + "logits/rejected": -2.0734565258026123, + "logps/chosen": -231.71762084960938, + "logps/rejected": -250.3955535888672, + "loss": 0.6892, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.026773914694786072, + "rewards/margins": 0.08530018478631973, + "rewards/rejected": -0.05852626636624336, + "step": 13730 + }, + { + "epoch": 0.9, + "learning_rate": 1.5384303092699504e-07, + "logits/chosen": -2.391803503036499, + "logits/rejected": -2.1646900177001953, + "logps/chosen": -284.1597900390625, + "logps/rejected": -295.56170654296875, + "loss": 0.6888, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.02824859321117401, + "rewards/margins": 0.13083064556121826, + "rewards/rejected": -0.10258202254772186, + "step": 13740 + }, + { + "epoch": 0.9, + "learning_rate": 1.518770610883613e-07, + "logits/chosen": -2.2725253105163574, + "logits/rejected": -2.001561403274536, + "logps/chosen": -215.4679412841797, + "logps/rejected": -199.0486297607422, + "loss": 0.6887, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.009293107315897942, + "rewards/margins": 0.12994512915611267, + "rewards/rejected": -0.13923820853233337, + "step": 13750 + }, + { + "epoch": 0.9, + "learning_rate": 1.4992334020921735e-07, + "logits/chosen": -2.275468111038208, + "logits/rejected": -2.1718432903289795, + "logps/chosen": -168.7460174560547, + "logps/rejected": -151.58731079101562, + "loss": 0.688, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.04226404055953026, + "rewards/margins": 0.12455103546380997, + "rewards/rejected": -0.08228699862957001, + "step": 13760 + }, + { + "epoch": 0.9, + "learning_rate": 1.4798187848112905e-07, + "logits/chosen": -2.203535556793213, + "logits/rejected": -2.2031960487365723, + "logps/chosen": -224.80953979492188, + "logps/rejected": -193.4915313720703, + "loss": 0.6885, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.018388142809271812, + "rewards/margins": 0.09539445489645004, + "rewards/rejected": -0.113782599568367, + "step": 13770 + }, + { + "epoch": 0.9, + "learning_rate": 1.460526860317113e-07, + "logits/chosen": -2.3986401557922363, + "logits/rejected": -2.332296848297119, + "logps/chosen": -173.04156494140625, + "logps/rejected": -216.95974731445312, + "loss": 0.686, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.0024310871958732605, + "rewards/margins": 0.12171381711959839, + "rewards/rejected": -0.11928270757198334, + "step": 13780 + }, + { + "epoch": 0.9, + "learning_rate": 1.441357729245771e-07, + "logits/chosen": -2.5229907035827637, + "logits/rejected": -1.9477760791778564, + "logps/chosen": -248.5333709716797, + "logps/rejected": -202.63084411621094, + "loss": 0.6893, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0054153092205524445, + "rewards/margins": 0.10084688663482666, + "rewards/rejected": -0.10626220703125, + "step": 13790 + }, + { + "epoch": 0.9, + "learning_rate": 1.4223114915928482e-07, + "logits/chosen": -2.1714558601379395, + "logits/rejected": -1.9200232028961182, + "logps/chosen": -225.0757293701172, + "logps/rejected": -238.7044219970703, + "loss": 0.69, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.008468665182590485, + "rewards/margins": 0.06861492991447449, + "rewards/rejected": -0.060146261006593704, + "step": 13800 + }, + { + "epoch": 0.9, + "eval_logits/chosen": -2.3265576362609863, + "eval_logits/rejected": -2.137892961502075, + "eval_logps/chosen": -230.2596893310547, + "eval_logps/rejected": -218.37973022460938, + "eval_loss": 0.6897318363189697, + "eval_rewards/accuracies": 0.6389999985694885, + "eval_rewards/chosen": 0.017452586442232132, + "eval_rewards/margins": 0.08513098210096359, + "eval_rewards/rejected": -0.06767839938402176, + "eval_runtime": 712.4298, + "eval_samples_per_second": 2.807, + "eval_steps_per_second": 1.404, + "step": 13800 + }, + { + "epoch": 0.9, + "learning_rate": 1.403388246712842e-07, + "logits/chosen": -2.2311573028564453, + "logits/rejected": -1.99333918094635, + "logps/chosen": -164.9827423095703, + "logps/rejected": -163.3975830078125, + "loss": 0.6903, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.00731696467846632, + "rewards/margins": 0.055666130036115646, + "rewards/rejected": -0.06298309564590454, + "step": 13810 + }, + { + "epoch": 0.9, + "learning_rate": 1.3845880933186757e-07, + "logits/chosen": -2.4922029972076416, + "logits/rejected": -2.229177951812744, + "logps/chosen": -237.0489959716797, + "logps/rejected": -206.5233154296875, + "loss": 0.6922, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.01689792238175869, + "rewards/margins": 0.037102360278367996, + "rewards/rejected": -0.020204436033964157, + "step": 13820 + }, + { + "epoch": 0.9, + "learning_rate": 1.3659111294811457e-07, + "logits/chosen": -2.3271474838256836, + "logits/rejected": -2.1840929985046387, + "logps/chosen": -193.53775024414062, + "logps/rejected": -181.85487365722656, + "loss": 0.6909, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.016945457085967064, + "rewards/margins": 0.062133751809597015, + "rewards/rejected": -0.07907922565937042, + "step": 13830 + }, + { + "epoch": 0.91, + "learning_rate": 1.347357452628459e-07, + "logits/chosen": -2.4782280921936035, + "logits/rejected": -2.344364643096924, + "logps/chosen": -241.06307983398438, + "logps/rejected": -240.8955078125, + "loss": 0.6911, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.04234471544623375, + "rewards/margins": 0.07334139943122864, + "rewards/rejected": -0.030996689572930336, + "step": 13840 + }, + { + "epoch": 0.91, + "learning_rate": 1.3289271595456732e-07, + "logits/chosen": -2.2512059211730957, + "logits/rejected": -2.064351797103882, + "logps/chosen": -204.22561645507812, + "logps/rejected": -189.5396270751953, + "loss": 0.6878, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.0010356527054682374, + "rewards/margins": 0.1110767275094986, + "rewards/rejected": -0.1100410670042038, + "step": 13850 + }, + { + "epoch": 0.91, + "learning_rate": 1.310620346374228e-07, + "logits/chosen": -2.1624104976654053, + "logits/rejected": -2.0331850051879883, + "logps/chosen": -229.8220672607422, + "logps/rejected": -205.7158660888672, + "loss": 0.6867, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.0239988062530756, + "rewards/margins": 0.12696941196918488, + "rewards/rejected": -0.10297061502933502, + "step": 13860 + }, + { + "epoch": 0.91, + "learning_rate": 1.2924371086114274e-07, + "logits/chosen": -2.213139057159424, + "logits/rejected": -1.9702332019805908, + "logps/chosen": -234.7813262939453, + "logps/rejected": -237.0749053955078, + "loss": 0.6902, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.01280174870043993, + "rewards/margins": 0.0832042545080185, + "rewards/rejected": -0.07040251046419144, + "step": 13870 + }, + { + "epoch": 0.91, + "learning_rate": 1.274377541109953e-07, + "logits/chosen": -2.175238847732544, + "logits/rejected": -2.2496845722198486, + "logps/chosen": -163.90567016601562, + "logps/rejected": -248.86221313476562, + "loss": 0.6888, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.004448303487151861, + "rewards/margins": 0.07032819092273712, + "rewards/rejected": -0.06587988883256912, + "step": 13880 + }, + { + "epoch": 0.91, + "learning_rate": 1.2564417380773435e-07, + "logits/chosen": -2.1063437461853027, + "logits/rejected": -1.9700400829315186, + "logps/chosen": -177.04061889648438, + "logps/rejected": -210.00146484375, + "loss": 0.6885, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.014564013108611107, + "rewards/margins": 0.10478832572698593, + "rewards/rejected": -0.09022431075572968, + "step": 13890 + }, + { + "epoch": 0.91, + "learning_rate": 1.2386297930755436e-07, + "logits/chosen": -2.3223581314086914, + "logits/rejected": -2.3294119834899902, + "logps/chosen": -249.2999267578125, + "logps/rejected": -253.54208374023438, + "loss": 0.6902, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.014141452498733997, + "rewards/margins": 0.0906859040260315, + "rewards/rejected": -0.10482735931873322, + "step": 13900 + }, + { + "epoch": 0.91, + "eval_logits/chosen": -2.3256688117980957, + "eval_logits/rejected": -2.137094497680664, + "eval_logps/chosen": -230.19509887695312, + "eval_logps/rejected": -218.29586791992188, + "eval_loss": 0.6897297501564026, + "eval_rewards/accuracies": 0.6399999856948853, + "eval_rewards/chosen": 0.01809842139482498, + "eval_rewards/margins": 0.08493825048208237, + "eval_rewards/rejected": -0.06683983653783798, + "eval_runtime": 714.4928, + "eval_samples_per_second": 2.799, + "eval_steps_per_second": 1.4, + "step": 13900 + }, + { + "epoch": 0.91, + "learning_rate": 1.220941799020378e-07, + "logits/chosen": -2.109891176223755, + "logits/rejected": -2.0245959758758545, + "logps/chosen": -217.385498046875, + "logps/rejected": -203.8630828857422, + "loss": 0.6903, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.016956061124801636, + "rewards/margins": 0.08830462396144867, + "rewards/rejected": -0.07134857028722763, + "step": 13910 + }, + { + "epoch": 0.91, + "learning_rate": 1.2033778481810975e-07, + "logits/chosen": -2.38193416595459, + "logits/rejected": -2.114203453063965, + "logps/chosen": -217.64212036132812, + "logps/rejected": -189.1055145263672, + "loss": 0.6876, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.02351401373744011, + "rewards/margins": 0.08809584379196167, + "rewards/rejected": -0.06458182632923126, + "step": 13920 + }, + { + "epoch": 0.91, + "learning_rate": 1.1859380321798591e-07, + "logits/chosen": -2.3214306831359863, + "logits/rejected": -2.388051748275757, + "logps/chosen": -200.78286743164062, + "logps/rejected": -222.736083984375, + "loss": 0.6887, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.017784133553504944, + "rewards/margins": 0.07208283245563507, + "rewards/rejected": -0.05429869890213013, + "step": 13930 + }, + { + "epoch": 0.91, + "learning_rate": 1.1686224419912989e-07, + "logits/chosen": -2.2252655029296875, + "logits/rejected": -2.0139126777648926, + "logps/chosen": -251.2513885498047, + "logps/rejected": -235.0546112060547, + "loss": 0.6871, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.016212433576583862, + "rewards/margins": 0.13094016909599304, + "rewards/rejected": -0.11472772061824799, + "step": 13940 + }, + { + "epoch": 0.91, + "learning_rate": 1.1514311679420104e-07, + "logits/chosen": -2.0391550064086914, + "logits/rejected": -2.1126651763916016, + "logps/chosen": -154.56124877929688, + "logps/rejected": -220.6207733154297, + "loss": 0.6874, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.005872879642993212, + "rewards/margins": 0.10239820182323456, + "rewards/rejected": -0.09652532637119293, + "step": 13950 + }, + { + "epoch": 0.91, + "learning_rate": 1.1343642997101029e-07, + "logits/chosen": -2.3179874420166016, + "logits/rejected": -2.2247185707092285, + "logps/chosen": -199.3680877685547, + "logps/rejected": -196.05459594726562, + "loss": 0.6916, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.029852483421564102, + "rewards/margins": 0.09908358752727509, + "rewards/rejected": -0.06923110783100128, + "step": 13960 + }, + { + "epoch": 0.91, + "learning_rate": 1.1174219263247188e-07, + "logits/chosen": -2.0668439865112305, + "logits/rejected": -1.9567053318023682, + "logps/chosen": -198.8998260498047, + "logps/rejected": -190.6134033203125, + "loss": 0.6897, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.000822742294985801, + "rewards/margins": 0.0924471914768219, + "rewards/rejected": -0.09326992928981781, + "step": 13970 + }, + { + "epoch": 0.91, + "learning_rate": 1.1006041361655839e-07, + "logits/chosen": -2.4929490089416504, + "logits/rejected": -2.028357982635498, + "logps/chosen": -208.04507446289062, + "logps/rejected": -175.75112915039062, + "loss": 0.6898, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.003921913914382458, + "rewards/margins": 0.07643552869558334, + "rewards/rejected": -0.07251361012458801, + "step": 13980 + }, + { + "epoch": 0.92, + "learning_rate": 1.0839110169625189e-07, + "logits/chosen": -2.05533504486084, + "logits/rejected": -2.352057933807373, + "logps/chosen": -205.25399780273438, + "logps/rejected": -211.28079223632812, + "loss": 0.6875, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.015882687643170357, + "rewards/margins": 0.133821040391922, + "rewards/rejected": -0.1179383248090744, + "step": 13990 + }, + { + "epoch": 0.92, + "learning_rate": 1.06734265579502e-07, + "logits/chosen": -2.337198495864868, + "logits/rejected": -2.033975124359131, + "logps/chosen": -262.0823974609375, + "logps/rejected": -206.0341796875, + "loss": 0.6883, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.005657001864165068, + "rewards/margins": 0.10456991195678711, + "rewards/rejected": -0.11022691428661346, + "step": 14000 + }, + { + "epoch": 0.92, + "eval_logits/chosen": -2.326231002807617, + "eval_logits/rejected": -2.137554168701172, + "eval_logps/chosen": -230.58172607421875, + "eval_logps/rejected": -218.70066833496094, + "eval_loss": 0.689732551574707, + "eval_rewards/accuracies": 0.6380000114440918, + "eval_rewards/chosen": 0.01423216424882412, + "eval_rewards/margins": 0.0851198136806488, + "eval_rewards/rejected": -0.07088765501976013, + "eval_runtime": 712.0104, + "eval_samples_per_second": 2.809, + "eval_steps_per_second": 1.404, + "step": 14000 + }, + { + "epoch": 0.92, + "learning_rate": 1.050899139091771e-07, + "logits/chosen": -2.416721820831299, + "logits/rejected": -2.0669798851013184, + "logps/chosen": -279.1082763671875, + "logps/rejected": -241.64761352539062, + "loss": 0.6898, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.0048080976121127605, + "rewards/margins": 0.09057492017745972, + "rewards/rejected": -0.09538300335407257, + "step": 14010 + }, + { + "epoch": 0.92, + "learning_rate": 1.0345805526302072e-07, + "logits/chosen": -2.242600917816162, + "logits/rejected": -2.3467628955841064, + "logps/chosen": -198.83761596679688, + "logps/rejected": -201.98965454101562, + "loss": 0.6904, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.023247262462973595, + "rewards/margins": 0.10389737784862518, + "rewards/rejected": -0.08065011352300644, + "step": 14020 + }, + { + "epoch": 0.92, + "learning_rate": 1.0183869815360764e-07, + "logits/chosen": -2.231644868850708, + "logits/rejected": -2.350635290145874, + "logps/chosen": -189.2916717529297, + "logps/rejected": -228.9336700439453, + "loss": 0.6914, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.02621796727180481, + "rewards/margins": 0.07001911103725433, + "rewards/rejected": -0.04380114749073982, + "step": 14030 + }, + { + "epoch": 0.92, + "learning_rate": 1.0023185102829763e-07, + "logits/chosen": -2.0455517768859863, + "logits/rejected": -2.2643027305603027, + "logps/chosen": -228.81332397460938, + "logps/rejected": -242.60061645507812, + "loss": 0.6898, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.020088955760002136, + "rewards/margins": 0.09602537006139755, + "rewards/rejected": -0.07593640685081482, + "step": 14040 + }, + { + "epoch": 0.92, + "learning_rate": 9.863752226919182e-08, + "logits/chosen": -2.2488272190093994, + "logits/rejected": -1.7611221075057983, + "logps/chosen": -235.35617065429688, + "logps/rejected": -182.02853393554688, + "loss": 0.6864, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.02376001887023449, + "rewards/margins": 0.12250945717096329, + "rewards/rejected": -0.09874944388866425, + "step": 14050 + }, + { + "epoch": 0.92, + "learning_rate": 9.705572019309107e-08, + "logits/chosen": -2.169804096221924, + "logits/rejected": -2.2068305015563965, + "logps/chosen": -266.7760925292969, + "logps/rejected": -246.24984741210938, + "loss": 0.6878, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.012513126246631145, + "rewards/margins": 0.10957686603069305, + "rewards/rejected": -0.09706376492977142, + "step": 14060 + }, + { + "epoch": 0.92, + "learning_rate": 9.548645305144849e-08, + "logits/chosen": -2.3847720623016357, + "logits/rejected": -2.2535669803619385, + "logps/chosen": -171.47378540039062, + "logps/rejected": -184.33763122558594, + "loss": 0.6876, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.020213961601257324, + "rewards/margins": 0.08890150487422943, + "rewards/rejected": -0.0686875432729721, + "step": 14070 + }, + { + "epoch": 0.92, + "learning_rate": 9.392972903033149e-08, + "logits/chosen": -2.306051254272461, + "logits/rejected": -2.159519672393799, + "logps/chosen": -222.64682006835938, + "logps/rejected": -218.47372436523438, + "loss": 0.6924, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.004844355396926403, + "rewards/margins": 0.03370397537946701, + "rewards/rejected": -0.028859620913863182, + "step": 14080 + }, + { + "epoch": 0.92, + "learning_rate": 9.238555625037449e-08, + "logits/chosen": -2.328538417816162, + "logits/rejected": -2.1120870113372803, + "logps/chosen": -188.9663848876953, + "logps/rejected": -166.45822143554688, + "loss": 0.6903, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.017418090254068375, + "rewards/margins": 0.0717364102602005, + "rewards/rejected": -0.05431831628084183, + "step": 14090 + }, + { + "epoch": 0.92, + "learning_rate": 9.085394276673903e-08, + "logits/chosen": -2.3301329612731934, + "logits/rejected": -2.0303778648376465, + "logps/chosen": -267.52117919921875, + "logps/rejected": -261.9759216308594, + "loss": 0.6898, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.006449407432228327, + "rewards/margins": 0.0890425592660904, + "rewards/rejected": -0.0825931578874588, + "step": 14100 + }, + { + "epoch": 0.92, + "eval_logits/chosen": -2.325167417526245, + "eval_logits/rejected": -2.1365697383880615, + "eval_logps/chosen": -230.42181396484375, + "eval_logps/rejected": -218.4661865234375, + "eval_loss": 0.6897357106208801, + "eval_rewards/accuracies": 0.637499988079071, + "eval_rewards/chosen": 0.0158314798027277, + "eval_rewards/margins": 0.08437444269657135, + "eval_rewards/rejected": -0.0685429498553276, + "eval_runtime": 710.8403, + "eval_samples_per_second": 2.814, + "eval_steps_per_second": 1.407, + "step": 14100 + }, + { + "epoch": 0.92, + "learning_rate": 8.933489656907157e-08, + "logits/chosen": -2.308310031890869, + "logits/rejected": -2.2029194831848145, + "logps/chosen": -217.09829711914062, + "logps/rejected": -247.48666381835938, + "loss": 0.6915, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.00237136147916317, + "rewards/margins": 0.05447987839579582, + "rewards/rejected": -0.0521085187792778, + "step": 14110 + }, + { + "epoch": 0.92, + "learning_rate": 8.782842558146127e-08, + "logits/chosen": -2.3630995750427246, + "logits/rejected": -2.2901546955108643, + "logps/chosen": -159.1778564453125, + "logps/rejected": -167.31874084472656, + "loss": 0.6884, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.02706066146492958, + "rewards/margins": 0.09847380220890045, + "rewards/rejected": -0.07141314446926117, + "step": 14120 + }, + { + "epoch": 0.92, + "learning_rate": 8.633453766239836e-08, + "logits/chosen": -2.415795087814331, + "logits/rejected": -2.186148166656494, + "logps/chosen": -230.60061645507812, + "logps/rejected": -205.28359985351562, + "loss": 0.6916, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.04520539939403534, + "rewards/margins": 0.06538231670856476, + "rewards/rejected": -0.020176919177174568, + "step": 14130 + }, + { + "epoch": 0.93, + "learning_rate": 8.485324060473448e-08, + "logits/chosen": -2.249516248703003, + "logits/rejected": -2.1395606994628906, + "logps/chosen": -237.13107299804688, + "logps/rejected": -232.41909790039062, + "loss": 0.6906, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.031510137021541595, + "rewards/margins": 0.07678806781768799, + "rewards/rejected": -0.04527793079614639, + "step": 14140 + }, + { + "epoch": 0.93, + "learning_rate": 8.338454213564052e-08, + "logits/chosen": -2.291496753692627, + "logits/rejected": -2.0189805030822754, + "logps/chosen": -230.54672241210938, + "logps/rejected": -223.07864379882812, + "loss": 0.6895, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.0035024010576307774, + "rewards/margins": 0.09726149588823318, + "rewards/rejected": -0.09375908970832825, + "step": 14150 + }, + { + "epoch": 0.93, + "learning_rate": 8.192844991656679e-08, + "logits/chosen": -2.2867093086242676, + "logits/rejected": -2.0489730834960938, + "logps/chosen": -237.1662139892578, + "logps/rejected": -208.5237579345703, + "loss": 0.6898, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.015504756942391396, + "rewards/margins": 0.07765550911426544, + "rewards/rejected": -0.0621507465839386, + "step": 14160 + }, + { + "epoch": 0.93, + "learning_rate": 8.048497154320434e-08, + "logits/chosen": -2.3233590126037598, + "logits/rejected": -2.3688528537750244, + "logps/chosen": -130.73397827148438, + "logps/rejected": -147.06256103515625, + "loss": 0.6895, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.011844434775412083, + "rewards/margins": 0.07231110334396362, + "rewards/rejected": -0.08415552228689194, + "step": 14170 + }, + { + "epoch": 0.93, + "learning_rate": 7.905411454544265e-08, + "logits/chosen": -2.324502468109131, + "logits/rejected": -2.174683094024658, + "logps/chosen": -236.509765625, + "logps/rejected": -246.95889282226562, + "loss": 0.6913, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -5.5506454373244196e-05, + "rewards/margins": 0.057068269699811935, + "rewards/rejected": -0.057123780250549316, + "step": 14180 + }, + { + "epoch": 0.93, + "learning_rate": 7.763588638733332e-08, + "logits/chosen": -2.3418097496032715, + "logits/rejected": -2.2953555583953857, + "logps/chosen": -260.10223388671875, + "logps/rejected": -249.2476348876953, + "loss": 0.6883, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.023179076611995697, + "rewards/margins": 0.08969531953334808, + "rewards/rejected": -0.06651624292135239, + "step": 14190 + }, + { + "epoch": 0.93, + "learning_rate": 7.623029446704899e-08, + "logits/chosen": -2.1991162300109863, + "logits/rejected": -2.3756251335144043, + "logps/chosen": -308.20355224609375, + "logps/rejected": -282.3921813964844, + "loss": 0.6894, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.03237896040081978, + "rewards/margins": 0.11083276569843292, + "rewards/rejected": -0.07845381647348404, + "step": 14200 + }, + { + "epoch": 0.93, + "eval_logits/chosen": -2.325500965118408, + "eval_logits/rejected": -2.1368813514709473, + "eval_logps/chosen": -230.51712036132812, + "eval_logps/rejected": -218.59410095214844, + "eval_loss": 0.6897341012954712, + "eval_rewards/accuracies": 0.637499988079071, + "eval_rewards/chosen": 0.014878012239933014, + "eval_rewards/margins": 0.08469977974891663, + "eval_rewards/rejected": -0.06982176750898361, + "eval_runtime": 711.1188, + "eval_samples_per_second": 2.812, + "eval_steps_per_second": 1.406, + "step": 14200 + }, + { + "epoch": 0.93, + "learning_rate": 7.483734611684557e-08, + "logits/chosen": -2.124391555786133, + "logits/rejected": -1.9591686725616455, + "logps/chosen": -254.62777709960938, + "logps/rejected": -209.1081085205078, + "loss": 0.6913, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.022580375894904137, + "rewards/margins": 0.08641272783279419, + "rewards/rejected": -0.0638323426246643, + "step": 14210 + }, + { + "epoch": 0.93, + "learning_rate": 7.345704860302366e-08, + "logits/chosen": -2.399385690689087, + "logits/rejected": -2.3872694969177246, + "logps/chosen": -246.51803588867188, + "logps/rejected": -255.66213989257812, + "loss": 0.6897, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.017346328124403954, + "rewards/margins": 0.09822475165128708, + "rewards/rejected": -0.08087843656539917, + "step": 14220 + }, + { + "epoch": 0.93, + "learning_rate": 7.208940912589224e-08, + "logits/chosen": -2.334130048751831, + "logits/rejected": -1.9988048076629639, + "logps/chosen": -210.1862335205078, + "logps/rejected": -185.14744567871094, + "loss": 0.6858, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.0059993029572069645, + "rewards/margins": 0.12339667230844498, + "rewards/rejected": -0.1293959617614746, + "step": 14230 + }, + { + "epoch": 0.93, + "learning_rate": 7.073443481972753e-08, + "logits/chosen": -2.1471316814422607, + "logits/rejected": -2.125434637069702, + "logps/chosen": -184.0389404296875, + "logps/rejected": -211.4552459716797, + "loss": 0.6878, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0011679657036438584, + "rewards/margins": 0.07914597541093826, + "rewards/rejected": -0.07797800749540329, + "step": 14240 + }, + { + "epoch": 0.93, + "learning_rate": 6.939213275274027e-08, + "logits/chosen": -2.2806684970855713, + "logits/rejected": -2.240402936935425, + "logps/chosen": -238.58798217773438, + "logps/rejected": -225.0943603515625, + "loss": 0.6907, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.006657888181507587, + "rewards/margins": 0.06109069660305977, + "rewards/rejected": -0.054432809352874756, + "step": 14250 + }, + { + "epoch": 0.93, + "learning_rate": 6.806250992703461e-08, + "logits/chosen": -2.3053228855133057, + "logits/rejected": -2.15181303024292, + "logps/chosen": -214.410400390625, + "logps/rejected": -195.6936798095703, + "loss": 0.6905, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.022841984406113625, + "rewards/margins": 0.07493428885936737, + "rewards/rejected": -0.0520923025906086, + "step": 14260 + }, + { + "epoch": 0.93, + "learning_rate": 6.674557327857572e-08, + "logits/chosen": -2.2985949516296387, + "logits/rejected": -2.311959743499756, + "logps/chosen": -251.68814086914062, + "logps/rejected": -254.1820831298828, + "loss": 0.6867, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.025825273245573044, + "rewards/margins": 0.11581947654485703, + "rewards/rejected": -0.08999422192573547, + "step": 14270 + }, + { + "epoch": 0.93, + "learning_rate": 6.544132967714917e-08, + "logits/chosen": -2.0479283332824707, + "logits/rejected": -2.0568835735321045, + "logps/chosen": -246.87765502929688, + "logps/rejected": -244.86984252929688, + "loss": 0.688, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.00017667413339950144, + "rewards/margins": 0.11704652011394501, + "rewards/rejected": -0.11722320318222046, + "step": 14280 + }, + { + "epoch": 0.93, + "learning_rate": 6.414978592632932e-08, + "logits/chosen": -2.3786206245422363, + "logits/rejected": -1.9369407892227173, + "logps/chosen": -260.21160888671875, + "logps/rejected": -222.98580932617188, + "loss": 0.6906, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.008877063170075417, + "rewards/margins": 0.09229589253664017, + "rewards/rejected": -0.0834188237786293, + "step": 14290 + }, + { + "epoch": 0.94, + "learning_rate": 6.287094876344046e-08, + "logits/chosen": -2.3294384479522705, + "logits/rejected": -2.362797260284424, + "logps/chosen": -168.59146118164062, + "logps/rejected": -183.0385284423828, + "loss": 0.6912, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.03828797861933708, + "rewards/margins": 0.07808025181293488, + "rewards/rejected": -0.0397922620177269, + "step": 14300 + }, + { + "epoch": 0.94, + "eval_logits/chosen": -2.325085163116455, + "eval_logits/rejected": -2.1365013122558594, + "eval_logps/chosen": -230.55078125, + "eval_logps/rejected": -218.63143920898438, + "eval_loss": 0.6897284388542175, + "eval_rewards/accuracies": 0.6399999856948853, + "eval_rewards/chosen": 0.014541618525981903, + "eval_rewards/margins": 0.08473705500364304, + "eval_rewards/rejected": -0.07019543647766113, + "eval_runtime": 710.3597, + "eval_samples_per_second": 2.815, + "eval_steps_per_second": 1.408, + "step": 14300 + }, + { + "epoch": 0.94, + "learning_rate": 6.160482485952413e-08, + "logits/chosen": -2.46873140335083, + "logits/rejected": -2.203112840652466, + "logps/chosen": -236.5559539794922, + "logps/rejected": -211.58816528320312, + "loss": 0.6898, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.001061995280906558, + "rewards/margins": 0.0682806596159935, + "rewards/rejected": -0.06934265792369843, + "step": 14310 + }, + { + "epoch": 0.94, + "learning_rate": 6.035142081930234e-08, + "logits/chosen": -2.333582639694214, + "logits/rejected": -1.9607412815093994, + "logps/chosen": -263.0543518066406, + "logps/rejected": -192.89089965820312, + "loss": 0.6906, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.002580016851425171, + "rewards/margins": 0.0685344785451889, + "rewards/rejected": -0.07111448794603348, + "step": 14320 + }, + { + "epoch": 0.94, + "learning_rate": 5.911074318114496e-08, + "logits/chosen": -2.16135311126709, + "logits/rejected": -2.2769253253936768, + "logps/chosen": -202.2639617919922, + "logps/rejected": -251.79052734375, + "loss": 0.6907, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.01324182003736496, + "rewards/margins": 0.07401735335588455, + "rewards/rejected": -0.060775529593229294, + "step": 14330 + }, + { + "epoch": 0.94, + "learning_rate": 5.788279841703381e-08, + "logits/chosen": -2.3670153617858887, + "logits/rejected": -2.1148641109466553, + "logps/chosen": -182.29180908203125, + "logps/rejected": -184.1455535888672, + "loss": 0.6891, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.01617772877216339, + "rewards/margins": 0.09520837664604187, + "rewards/rejected": -0.07903064042329788, + "step": 14340 + }, + { + "epoch": 0.94, + "learning_rate": 5.66675929325311e-08, + "logits/chosen": -2.369096279144287, + "logits/rejected": -2.1264519691467285, + "logps/chosen": -218.7493438720703, + "logps/rejected": -212.5388641357422, + "loss": 0.6913, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.019151246175169945, + "rewards/margins": 0.05138329789042473, + "rewards/rejected": -0.032232046127319336, + "step": 14350 + }, + { + "epoch": 0.94, + "learning_rate": 5.546513306674301e-08, + "logits/chosen": -2.2958080768585205, + "logits/rejected": -1.9007478952407837, + "logps/chosen": -276.90521240234375, + "logps/rejected": -211.9548797607422, + "loss": 0.6883, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.016114667057991028, + "rewards/margins": 0.10132592916488647, + "rewards/rejected": -0.08521126955747604, + "step": 14360 + }, + { + "epoch": 0.94, + "learning_rate": 5.4275425092290004e-08, + "logits/chosen": -2.4178357124328613, + "logits/rejected": -2.370293140411377, + "logps/chosen": -252.72412109375, + "logps/rejected": -242.9341278076172, + "loss": 0.6913, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.020720353350043297, + "rewards/margins": 0.08652675151824951, + "rewards/rejected": -0.06580640375614166, + "step": 14370 + }, + { + "epoch": 0.94, + "learning_rate": 5.309847521527078e-08, + "logits/chosen": -2.2703206539154053, + "logits/rejected": -1.9016317129135132, + "logps/chosen": -283.227294921875, + "logps/rejected": -247.84408569335938, + "loss": 0.6888, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.013460688292980194, + "rewards/margins": 0.07545115798711777, + "rewards/rejected": -0.06199047714471817, + "step": 14380 + }, + { + "epoch": 0.94, + "learning_rate": 5.1934289575233385e-08, + "logits/chosen": -2.147324323654175, + "logits/rejected": -1.8097765445709229, + "logps/chosen": -240.4329071044922, + "logps/rejected": -215.96237182617188, + "loss": 0.6894, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.009034966118633747, + "rewards/margins": 0.09896841645240784, + "rewards/rejected": -0.10800337791442871, + "step": 14390 + }, + { + "epoch": 0.94, + "learning_rate": 5.078287424513994e-08, + "logits/chosen": -2.390627384185791, + "logits/rejected": -2.2773404121398926, + "logps/chosen": -270.3782653808594, + "logps/rejected": -206.30615234375, + "loss": 0.6893, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.003305424703285098, + "rewards/margins": 0.10758145898580551, + "rewards/rejected": -0.11088689416646957, + "step": 14400 + }, + { + "epoch": 0.94, + "eval_logits/chosen": -2.3247244358062744, + "eval_logits/rejected": -2.1361277103424072, + "eval_logps/chosen": -230.61825561523438, + "eval_logps/rejected": -218.7084503173828, + "eval_loss": 0.6897271871566772, + "eval_rewards/accuracies": 0.640999972820282, + "eval_rewards/chosen": 0.013867066241800785, + "eval_rewards/margins": 0.08483249694108963, + "eval_rewards/rejected": -0.07096543163061142, + "eval_runtime": 710.9987, + "eval_samples_per_second": 2.813, + "eval_steps_per_second": 1.406, + "step": 14400 + }, + { + "epoch": 0.94, + "learning_rate": 4.964423523133671e-08, + "logits/chosen": -2.3970162868499756, + "logits/rejected": -2.158749580383301, + "logps/chosen": -215.6207275390625, + "logps/rejected": -187.32325744628906, + "loss": 0.6919, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.021107520908117294, + "rewards/margins": 0.060516953468322754, + "rewards/rejected": -0.03940943256020546, + "step": 14410 + }, + { + "epoch": 0.94, + "learning_rate": 4.8518378473522976e-08, + "logits/chosen": -2.262331008911133, + "logits/rejected": -2.106858253479004, + "logps/chosen": -250.4281768798828, + "logps/rejected": -251.893310546875, + "loss": 0.6871, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.000816689629573375, + "rewards/margins": 0.08192334324121475, + "rewards/rejected": -0.08110664784908295, + "step": 14420 + }, + { + "epoch": 0.94, + "learning_rate": 4.7405309844718584e-08, + "logits/chosen": -2.184021472930908, + "logits/rejected": -2.0803000926971436, + "logps/chosen": -192.67074584960938, + "logps/rejected": -213.4363250732422, + "loss": 0.6872, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.0017172780353575945, + "rewards/margins": 0.11569315195083618, + "rewards/rejected": -0.11741043627262115, + "step": 14430 + }, + { + "epoch": 0.94, + "learning_rate": 4.630503515123508e-08, + "logits/chosen": -2.426945209503174, + "logits/rejected": -2.128527879714966, + "logps/chosen": -200.57327270507812, + "logps/rejected": -160.0536651611328, + "loss": 0.6882, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.0029270625673234463, + "rewards/margins": 0.09346749633550644, + "rewards/rejected": -0.09054042398929596, + "step": 14440 + }, + { + "epoch": 0.95, + "learning_rate": 4.5217560132644056e-08, + "logits/chosen": -2.2494208812713623, + "logits/rejected": -2.1792380809783936, + "logps/chosen": -148.93338012695312, + "logps/rejected": -174.1099395751953, + "loss": 0.6905, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.016862403601408005, + "rewards/margins": 0.06775657087564468, + "rewards/rejected": -0.05089417099952698, + "step": 14450 + }, + { + "epoch": 0.95, + "learning_rate": 4.41428904617483e-08, + "logits/chosen": -2.2892022132873535, + "logits/rejected": -2.2819294929504395, + "logps/chosen": -177.44277954101562, + "logps/rejected": -189.7292938232422, + "loss": 0.6925, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0033408640883862972, + "rewards/margins": 0.07625994831323624, + "rewards/rejected": -0.07960081100463867, + "step": 14460 + }, + { + "epoch": 0.95, + "learning_rate": 4.3081031744550696e-08, + "logits/chosen": -2.3867905139923096, + "logits/rejected": -2.3108649253845215, + "logps/chosen": -250.4009552001953, + "logps/rejected": -236.97607421875, + "loss": 0.6895, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.027443695813417435, + "rewards/margins": 0.08902832120656967, + "rewards/rejected": -0.061584629118442535, + "step": 14470 + }, + { + "epoch": 0.95, + "learning_rate": 4.2031989520227025e-08, + "logits/chosen": -2.3677258491516113, + "logits/rejected": -2.1632471084594727, + "logps/chosen": -225.92098999023438, + "logps/rejected": -210.75198364257812, + "loss": 0.6907, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.023934394121170044, + "rewards/margins": 0.06718473136425018, + "rewards/rejected": -0.04325033351778984, + "step": 14480 + }, + { + "epoch": 0.95, + "learning_rate": 4.099576926109461e-08, + "logits/chosen": -2.445075273513794, + "logits/rejected": -1.9394609928131104, + "logps/chosen": -237.6958465576172, + "logps/rejected": -165.7293243408203, + "loss": 0.6902, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0208599753677845, + "rewards/margins": 0.07187938690185547, + "rewards/rejected": -0.05101940780878067, + "step": 14490 + }, + { + "epoch": 0.95, + "learning_rate": 3.997237637258705e-08, + "logits/chosen": -2.277336835861206, + "logits/rejected": -2.330341339111328, + "logps/chosen": -313.8475341796875, + "logps/rejected": -279.31109619140625, + "loss": 0.6914, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.03405776619911194, + "rewards/margins": 0.08806699514389038, + "rewards/rejected": -0.05400923639535904, + "step": 14500 + }, + { + "epoch": 0.95, + "eval_logits/chosen": -2.3249900341033936, + "eval_logits/rejected": -2.136406660079956, + "eval_logps/chosen": -230.61793518066406, + "eval_logps/rejected": -218.70700073242188, + "eval_loss": 0.6897269487380981, + "eval_rewards/accuracies": 0.6370000243186951, + "eval_rewards/chosen": 0.013869978487491608, + "eval_rewards/margins": 0.08482073247432709, + "eval_rewards/rejected": -0.07095075398683548, + "eval_runtime": 710.297, + "eval_samples_per_second": 2.816, + "eval_steps_per_second": 1.408, + "step": 14500 + }, + { + "epoch": 0.95, + "learning_rate": 3.8961816193222035e-08, + "logits/chosen": -2.4175612926483154, + "logits/rejected": -2.200261354446411, + "logps/chosen": -237.2295379638672, + "logps/rejected": -184.6630096435547, + "loss": 0.6922, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.013792415149509907, + "rewards/margins": 0.04766743257641792, + "rewards/rejected": -0.061459846794605255, + "step": 14510 + }, + { + "epoch": 0.95, + "learning_rate": 3.79640939945769e-08, + "logits/chosen": -2.3744473457336426, + "logits/rejected": -2.240773916244507, + "logps/chosen": -283.0396728515625, + "logps/rejected": -200.9557647705078, + "loss": 0.6922, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.03181237354874611, + "rewards/margins": 0.04180184006690979, + "rewards/rejected": -0.009989465586841106, + "step": 14520 + }, + { + "epoch": 0.95, + "learning_rate": 3.697921498125895e-08, + "logits/chosen": -2.109978199005127, + "logits/rejected": -2.2066032886505127, + "logps/chosen": -211.96316528320312, + "logps/rejected": -225.83584594726562, + "loss": 0.6896, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0201013945043087, + "rewards/margins": 0.07992889732122421, + "rewards/rejected": -0.10003030300140381, + "step": 14530 + }, + { + "epoch": 0.95, + "learning_rate": 3.6007184290880456e-08, + "logits/chosen": -2.3271656036376953, + "logits/rejected": -2.227792739868164, + "logps/chosen": -211.0209197998047, + "logps/rejected": -203.0982666015625, + "loss": 0.6908, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.009520738385617733, + "rewards/margins": 0.08740987628698349, + "rewards/rejected": -0.07788912951946259, + "step": 14540 + }, + { + "epoch": 0.95, + "learning_rate": 3.504800699402872e-08, + "logits/chosen": -2.557425022125244, + "logits/rejected": -2.2593464851379395, + "logps/chosen": -342.3351135253906, + "logps/rejected": -273.93585205078125, + "loss": 0.6919, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.03319120034575462, + "rewards/margins": 0.04967629909515381, + "rewards/rejected": -0.016485098749399185, + "step": 14550 + }, + { + "epoch": 0.95, + "learning_rate": 3.4101688094242967e-08, + "logits/chosen": -2.264317035675049, + "logits/rejected": -2.1588878631591797, + "logps/chosen": -270.6173095703125, + "logps/rejected": -257.766357421875, + "loss": 0.6898, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.010563389398157597, + "rewards/margins": 0.12325240671634674, + "rewards/rejected": -0.13381578028202057, + "step": 14560 + }, + { + "epoch": 0.95, + "learning_rate": 3.3168232527985564e-08, + "logits/chosen": -2.2050108909606934, + "logits/rejected": -1.9072158336639404, + "logps/chosen": -241.6295166015625, + "logps/rejected": -194.75392150878906, + "loss": 0.6902, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.00010975040640914813, + "rewards/margins": 0.07874272018671036, + "rewards/rejected": -0.07885247468948364, + "step": 14570 + }, + { + "epoch": 0.95, + "learning_rate": 3.224764516461892e-08, + "logits/chosen": -2.3158745765686035, + "logits/rejected": -2.1368792057037354, + "logps/chosen": -252.09890747070312, + "logps/rejected": -232.61062622070312, + "loss": 0.6878, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.028999557718634605, + "rewards/margins": 0.1126040369272232, + "rewards/rejected": -0.08360447734594345, + "step": 14580 + }, + { + "epoch": 0.95, + "learning_rate": 3.133993080637665e-08, + "logits/chosen": -2.2999939918518066, + "logits/rejected": -2.108703374862671, + "logps/chosen": -201.35792541503906, + "logps/rejected": -199.75430297851562, + "loss": 0.6894, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.015230001881718636, + "rewards/margins": 0.10807321965694427, + "rewards/rejected": -0.09284321218729019, + "step": 14590 + }, + { + "epoch": 0.96, + "learning_rate": 3.0445094188342186e-08, + "logits/chosen": -2.112121343612671, + "logits/rejected": -1.8166097402572632, + "logps/chosen": -253.4682159423828, + "logps/rejected": -182.1717987060547, + "loss": 0.6897, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 8.347779657924548e-05, + "rewards/margins": 0.08967778086662292, + "rewards/rejected": -0.0895942971110344, + "step": 14600 + }, + { + "epoch": 0.96, + "eval_logits/chosen": -2.3248980045318604, + "eval_logits/rejected": -2.1363399028778076, + "eval_logps/chosen": -230.62680053710938, + "eval_logps/rejected": -218.67767333984375, + "eval_loss": 0.6897242665290833, + "eval_rewards/accuracies": 0.6355000138282776, + "eval_rewards/chosen": 0.013781617395579815, + "eval_rewards/margins": 0.08443937450647354, + "eval_rewards/rejected": -0.07065775245428085, + "eval_runtime": 710.5131, + "eval_samples_per_second": 2.815, + "eval_steps_per_second": 1.407, + "step": 14600 + }, + { + "epoch": 0.96, + "learning_rate": 2.9563139978421028e-08, + "logits/chosen": -2.2371926307678223, + "logits/rejected": -2.259657382965088, + "logps/chosen": -223.68856811523438, + "logps/rejected": -221.4380340576172, + "loss": 0.6905, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.021833080798387527, + "rewards/margins": 0.05533973500132561, + "rewards/rejected": -0.03350665792822838, + "step": 14610 + }, + { + "epoch": 0.96, + "learning_rate": 2.869407277731939e-08, + "logits/chosen": -2.2002997398376465, + "logits/rejected": -2.1275038719177246, + "logps/chosen": -183.22903442382812, + "logps/rejected": -168.91224670410156, + "loss": 0.6902, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.02028103545308113, + "rewards/margins": 0.08852804452180862, + "rewards/rejected": -0.06824701279401779, + "step": 14620 + }, + { + "epoch": 0.96, + "learning_rate": 2.783789711851642e-08, + "logits/chosen": -2.3309357166290283, + "logits/rejected": -2.1030194759368896, + "logps/chosen": -157.8726043701172, + "logps/rejected": -147.86341857910156, + "loss": 0.6878, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.03978399187326431, + "rewards/margins": 0.12423492968082428, + "rewards/rejected": -0.08445094525814056, + "step": 14630 + }, + { + "epoch": 0.96, + "learning_rate": 2.6994617468244778e-08, + "logits/chosen": -2.4272868633270264, + "logits/rejected": -1.9481617212295532, + "logps/chosen": -210.47348022460938, + "logps/rejected": -158.01644897460938, + "loss": 0.689, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.02547174133360386, + "rewards/margins": 0.11461669206619263, + "rewards/rejected": -0.08914494514465332, + "step": 14640 + }, + { + "epoch": 0.96, + "learning_rate": 2.6164238225463155e-08, + "logits/chosen": -2.264310598373413, + "logits/rejected": -1.9083404541015625, + "logps/chosen": -281.5527648925781, + "logps/rejected": -212.22207641601562, + "loss": 0.6907, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.0087438328191638, + "rewards/margins": 0.09177269041538239, + "rewards/rejected": -0.08302884548902512, + "step": 14650 + }, + { + "epoch": 0.96, + "learning_rate": 2.534676372183742e-08, + "logits/chosen": -2.288485527038574, + "logits/rejected": -2.1355769634246826, + "logps/chosen": -282.95892333984375, + "logps/rejected": -237.60372924804688, + "loss": 0.6901, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.009953884407877922, + "rewards/margins": 0.06971423327922821, + "rewards/rejected": -0.05976034328341484, + "step": 14660 + }, + { + "epoch": 0.96, + "learning_rate": 2.4542198221714218e-08, + "logits/chosen": -2.1728930473327637, + "logits/rejected": -1.9003547430038452, + "logps/chosen": -137.03646850585938, + "logps/rejected": -146.15728759765625, + "loss": 0.6883, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 0.010704811662435532, + "rewards/margins": 0.08893907815217972, + "rewards/rejected": -0.07823427021503448, + "step": 14670 + }, + { + "epoch": 0.96, + "learning_rate": 2.3750545922101854e-08, + "logits/chosen": -2.5925240516662598, + "logits/rejected": -2.207447052001953, + "logps/chosen": -308.4715270996094, + "logps/rejected": -248.4219207763672, + "loss": 0.6903, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.009196789935231209, + "rewards/margins": 0.08344466239213943, + "rewards/rejected": -0.07424787431955338, + "step": 14680 + }, + { + "epoch": 0.96, + "learning_rate": 2.2971810952646112e-08, + "logits/chosen": -2.3056700229644775, + "logits/rejected": -2.2151083946228027, + "logps/chosen": -256.753662109375, + "logps/rejected": -214.6802215576172, + "loss": 0.6908, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.01609097793698311, + "rewards/margins": 0.056323904544115067, + "rewards/rejected": -0.04023292288184166, + "step": 14690 + }, + { + "epoch": 0.96, + "learning_rate": 2.2205997375610576e-08, + "logits/chosen": -2.1445729732513428, + "logits/rejected": -2.0863611698150635, + "logps/chosen": -181.83251953125, + "logps/rejected": -196.3846435546875, + "loss": 0.691, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03673393279314041, + "rewards/margins": 0.09392055124044418, + "rewards/rejected": -0.057186610996723175, + "step": 14700 + }, + { + "epoch": 0.96, + "eval_logits/chosen": -2.324772834777832, + "eval_logits/rejected": -2.1362075805664062, + "eval_logps/chosen": -230.6251678466797, + "eval_logps/rejected": -218.66000366210938, + "eval_loss": 0.6897311806678772, + "eval_rewards/accuracies": 0.6365000009536743, + "eval_rewards/chosen": 0.013797725550830364, + "eval_rewards/margins": 0.08427882194519043, + "eval_rewards/rejected": -0.07048109173774719, + "eval_runtime": 709.9702, + "eval_samples_per_second": 2.817, + "eval_steps_per_second": 1.409, + "step": 14700 + }, + { + "epoch": 0.96, + "learning_rate": 2.1453109185853304e-08, + "logits/chosen": -2.3570213317871094, + "logits/rejected": -2.299567461013794, + "logps/chosen": -199.55894470214844, + "logps/rejected": -209.33212280273438, + "loss": 0.689, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.02519531175494194, + "rewards/margins": 0.08025936782360077, + "rewards/rejected": -0.05506405234336853, + "step": 14710 + }, + { + "epoch": 0.96, + "learning_rate": 2.0713150310808784e-08, + "logits/chosen": -2.121694564819336, + "logits/rejected": -2.446938991546631, + "logps/chosen": -226.5384979248047, + "logps/rejected": -235.46133422851562, + "loss": 0.6924, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.010506866499781609, + "rewards/margins": 0.031594760715961456, + "rewards/rejected": -0.042101629078388214, + "step": 14720 + }, + { + "epoch": 0.96, + "learning_rate": 1.9986124610464064e-08, + "logits/chosen": -2.2158148288726807, + "logits/rejected": -1.9099407196044922, + "logps/chosen": -285.78350830078125, + "logps/rejected": -228.42324829101562, + "loss": 0.6882, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.014465957880020142, + "rewards/margins": 0.12452026456594467, + "rewards/rejected": -0.11005431413650513, + "step": 14730 + }, + { + "epoch": 0.96, + "learning_rate": 1.927203587734211e-08, + "logits/chosen": -2.19920015335083, + "logits/rejected": -1.8051159381866455, + "logps/chosen": -250.2556915283203, + "logps/rejected": -210.61441040039062, + "loss": 0.6888, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.027360107749700546, + "rewards/margins": 0.08885184675455093, + "rewards/rejected": -0.06149173900485039, + "step": 14740 + }, + { + "epoch": 0.97, + "learning_rate": 1.8570887836479034e-08, + "logits/chosen": -2.3139779567718506, + "logits/rejected": -2.1160130500793457, + "logps/chosen": -195.26492309570312, + "logps/rejected": -255.01797485351562, + "loss": 0.6899, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.016417790204286575, + "rewards/margins": 0.06937674432992935, + "rewards/rejected": -0.08579452335834503, + "step": 14750 + }, + { + "epoch": 0.97, + "learning_rate": 1.7882684145406616e-08, + "logits/chosen": -2.3864521980285645, + "logits/rejected": -2.3232614994049072, + "logps/chosen": -292.4091796875, + "logps/rejected": -298.75286865234375, + "loss": 0.6878, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.04011436551809311, + "rewards/margins": 0.0823252946138382, + "rewards/rejected": -0.04221092164516449, + "step": 14760 + }, + { + "epoch": 0.97, + "learning_rate": 1.7207428394132865e-08, + "logits/chosen": -2.5386404991149902, + "logits/rejected": -2.052577018737793, + "logps/chosen": -268.1858825683594, + "logps/rejected": -225.7917938232422, + "loss": 0.6873, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.01594383455812931, + "rewards/margins": 0.12293130159378052, + "rewards/rejected": -0.10698747634887695, + "step": 14770 + }, + { + "epoch": 0.97, + "learning_rate": 1.654512410512177e-08, + "logits/chosen": -2.2604587078094482, + "logits/rejected": -2.0169408321380615, + "logps/chosen": -253.12109375, + "logps/rejected": -196.11129760742188, + "loss": 0.692, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.004558461718261242, + "rewards/margins": 0.0537867471575737, + "rewards/rejected": -0.04922827333211899, + "step": 14780 + }, + { + "epoch": 0.97, + "learning_rate": 1.5895774733277468e-08, + "logits/chosen": -2.3233249187469482, + "logits/rejected": -1.9812166690826416, + "logps/chosen": -277.13775634765625, + "logps/rejected": -236.5921630859375, + "loss": 0.6891, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.026791805401444435, + "rewards/margins": 0.09690927714109421, + "rewards/rejected": -0.07011748105287552, + "step": 14790 + }, + { + "epoch": 0.97, + "learning_rate": 1.5259383665924e-08, + "logits/chosen": -2.584911823272705, + "logits/rejected": -2.1934475898742676, + "logps/chosen": -337.8364562988281, + "logps/rejected": -252.3568572998047, + "loss": 0.6897, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.042554210871458054, + "rewards/margins": 0.08529296517372131, + "rewards/rejected": -0.04273875802755356, + "step": 14800 + }, + { + "epoch": 0.97, + "eval_logits/chosen": -2.324951171875, + "eval_logits/rejected": -2.136404037475586, + "eval_logps/chosen": -230.61355590820312, + "eval_logps/rejected": -218.665283203125, + "eval_loss": 0.6897242665290833, + "eval_rewards/accuracies": 0.6340000033378601, + "eval_rewards/chosen": 0.013913972303271294, + "eval_rewards/margins": 0.08444766700267792, + "eval_rewards/rejected": -0.07053370773792267, + "eval_runtime": 710.025, + "eval_samples_per_second": 2.817, + "eval_steps_per_second": 1.408, + "step": 14800 + }, + { + "epoch": 0.97, + "learning_rate": 1.4635954222789461e-08, + "logits/chosen": -2.2716784477233887, + "logits/rejected": -2.1852469444274902, + "logps/chosen": -214.40774536132812, + "logps/rejected": -227.9254150390625, + "loss": 0.6912, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.026998598128557205, + "rewards/margins": 0.08409784734249115, + "rewards/rejected": -0.05709924176335335, + "step": 14810 + }, + { + "epoch": 0.97, + "learning_rate": 1.402548965598688e-08, + "logits/chosen": -2.210674285888672, + "logits/rejected": -2.305459499359131, + "logps/chosen": -202.4854736328125, + "logps/rejected": -205.67385864257812, + "loss": 0.6903, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.0214177705347538, + "rewards/margins": 0.06404820829629898, + "rewards/rejected": -0.04263044521212578, + "step": 14820 + }, + { + "epoch": 0.97, + "learning_rate": 1.3427993149998375e-08, + "logits/chosen": -2.4624884128570557, + "logits/rejected": -2.211487293243408, + "logps/chosen": -238.8626251220703, + "logps/rejected": -194.0426483154297, + "loss": 0.6889, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.03174557164311409, + "rewards/margins": 0.10550177097320557, + "rewards/rejected": -0.07375619560480118, + "step": 14830 + }, + { + "epoch": 0.97, + "learning_rate": 1.2843467821658518e-08, + "logits/chosen": -2.4166269302368164, + "logits/rejected": -2.350491523742676, + "logps/chosen": -222.0076141357422, + "logps/rejected": -231.8187713623047, + "loss": 0.6888, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.04215265437960625, + "rewards/margins": 0.1017962247133255, + "rewards/rejected": -0.05964357778429985, + "step": 14840 + }, + { + "epoch": 0.97, + "learning_rate": 1.2271916720137666e-08, + "logits/chosen": -2.52500581741333, + "logits/rejected": -2.208137035369873, + "logps/chosen": -285.6896057128906, + "logps/rejected": -245.7957000732422, + "loss": 0.6923, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0007992383325472474, + "rewards/margins": 0.06241076439619064, + "rewards/rejected": -0.06161152571439743, + "step": 14850 + }, + { + "epoch": 0.97, + "learning_rate": 1.171334282692671e-08, + "logits/chosen": -2.3669638633728027, + "logits/rejected": -2.2914958000183105, + "logps/chosen": -280.2817077636719, + "logps/rejected": -260.38970947265625, + "loss": 0.6904, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.023167315870523453, + "rewards/margins": 0.10739920288324356, + "rewards/rejected": -0.084231898188591, + "step": 14860 + }, + { + "epoch": 0.97, + "learning_rate": 1.116774905582041e-08, + "logits/chosen": -2.4192698001861572, + "logits/rejected": -2.087759017944336, + "logps/chosen": -183.83871459960938, + "logps/rejected": -177.45944213867188, + "loss": 0.6911, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.00979800708591938, + "rewards/margins": 0.06323973834514618, + "rewards/rejected": -0.05344173312187195, + "step": 14870 + }, + { + "epoch": 0.97, + "learning_rate": 1.0635138252902966e-08, + "logits/chosen": -2.504063129425049, + "logits/rejected": -2.2471530437469482, + "logps/chosen": -229.2535400390625, + "logps/rejected": -215.491455078125, + "loss": 0.6889, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.011604288592934608, + "rewards/margins": 0.08366361260414124, + "rewards/rejected": -0.07205932587385178, + "step": 14880 + }, + { + "epoch": 0.97, + "learning_rate": 1.0115513196533589e-08, + "logits/chosen": -2.338160991668701, + "logits/rejected": -2.171861410140991, + "logps/chosen": -261.0177001953125, + "logps/rejected": -249.5078125, + "loss": 0.6914, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.015924451872706413, + "rewards/margins": 0.05798298865556717, + "rewards/rejected": -0.04205853492021561, + "step": 14890 + }, + { + "epoch": 0.97, + "learning_rate": 9.608876597330952e-09, + "logits/chosen": -2.3476691246032715, + "logits/rejected": -2.022096633911133, + "logps/chosen": -288.65777587890625, + "logps/rejected": -285.6450500488281, + "loss": 0.6892, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.020350560545921326, + "rewards/margins": 0.08672511577606201, + "rewards/rejected": -0.10707566887140274, + "step": 14900 + }, + { + "epoch": 0.97, + "eval_logits/chosen": -2.3250153064727783, + "eval_logits/rejected": -2.1365039348602295, + "eval_logps/chosen": -230.6240997314453, + "eval_logps/rejected": -218.6448974609375, + "eval_loss": 0.6897345185279846, + "eval_rewards/accuracies": 0.6380000114440918, + "eval_rewards/chosen": 0.013808542862534523, + "eval_rewards/margins": 0.08413854986429214, + "eval_rewards/rejected": -0.07033000141382217, + "eval_runtime": 709.7106, + "eval_samples_per_second": 2.818, + "eval_steps_per_second": 1.409, + "step": 14900 + }, + { + "epoch": 0.98, + "learning_rate": 9.115231098159594e-09, + "logits/chosen": -2.4037442207336426, + "logits/rejected": -2.292450189590454, + "logps/chosen": -251.7987518310547, + "logps/rejected": -238.9403076171875, + "loss": 0.691, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.015149926766753197, + "rewards/margins": 0.074435293674469, + "rewards/rejected": -0.05928536504507065, + "step": 14910 + }, + { + "epoch": 0.98, + "learning_rate": 8.634579274116317e-09, + "logits/chosen": -2.260741710662842, + "logits/rejected": -2.1925673484802246, + "logps/chosen": -188.34481811523438, + "logps/rejected": -219.61099243164062, + "loss": 0.6899, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.01714429259300232, + "rewards/margins": 0.10656633228063583, + "rewards/rejected": -0.08942203223705292, + "step": 14920 + }, + { + "epoch": 0.98, + "learning_rate": 8.166923632516865e-09, + "logits/chosen": -2.439234733581543, + "logits/rejected": -2.150946617126465, + "logps/chosen": -225.3989715576172, + "logps/rejected": -278.0585021972656, + "loss": 0.6849, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.02332211658358574, + "rewards/margins": 0.1503240466117859, + "rewards/rejected": -0.12700191140174866, + "step": 14930 + }, + { + "epoch": 0.98, + "learning_rate": 7.712266612881492e-09, + "logits/chosen": -2.1845946311950684, + "logits/rejected": -2.0539097785949707, + "logps/chosen": -181.65396118164062, + "logps/rejected": -185.66207885742188, + "loss": 0.69, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.0358271598815918, + "rewards/margins": 0.088839091360569, + "rewards/rejected": -0.0530119314789772, + "step": 14940 + }, + { + "epoch": 0.98, + "learning_rate": 7.270610586924687e-09, + "logits/chosen": -2.450291872024536, + "logits/rejected": -2.2107715606689453, + "logps/chosen": -260.78521728515625, + "logps/rejected": -224.33633422851562, + "loss": 0.6905, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.041800957173109055, + "rewards/margins": 0.07631386816501617, + "rewards/rejected": -0.03451291099190712, + "step": 14950 + }, + { + "epoch": 0.98, + "learning_rate": 6.841957858539916e-09, + "logits/chosen": -2.2664036750793457, + "logits/rejected": -2.1220388412475586, + "logps/chosen": -168.59519958496094, + "logps/rejected": -185.31361389160156, + "loss": 0.6911, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.019396457821130753, + "rewards/margins": 0.05867626518011093, + "rewards/rejected": -0.07807272672653198, + "step": 14960 + }, + { + "epoch": 0.98, + "learning_rate": 6.426310663790181e-09, + "logits/chosen": -2.1963348388671875, + "logits/rejected": -2.1024935245513916, + "logps/chosen": -236.64208984375, + "logps/rejected": -211.93154907226562, + "loss": 0.6904, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.0033364570699632168, + "rewards/margins": 0.06757830083370209, + "rewards/rejected": -0.06424184143543243, + "step": 14970 + }, + { + "epoch": 0.98, + "learning_rate": 6.023671170894696e-09, + "logits/chosen": -2.5223042964935303, + "logits/rejected": -1.931014060974121, + "logps/chosen": -288.4738464355469, + "logps/rejected": -213.46670532226562, + "loss": 0.6883, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.03854692354798317, + "rewards/margins": 0.11772453784942627, + "rewards/rejected": -0.0791776180267334, + "step": 14980 + }, + { + "epoch": 0.98, + "learning_rate": 5.634041480218344e-09, + "logits/chosen": -2.4757115840911865, + "logits/rejected": -2.137505054473877, + "logps/chosen": -250.7669677734375, + "logps/rejected": -253.52487182617188, + "loss": 0.6897, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.023066259920597076, + "rewards/margins": 0.07198301702737808, + "rewards/rejected": -0.0489167645573616, + "step": 14990 + }, + { + "epoch": 0.98, + "learning_rate": 5.257423624260849e-09, + "logits/chosen": -2.520918369293213, + "logits/rejected": -2.0873606204986572, + "logps/chosen": -260.77056884765625, + "logps/rejected": -224.59854125976562, + "loss": 0.6925, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.025528425350785255, + "rewards/margins": 0.07071218639612198, + "rewards/rejected": -0.045183759182691574, + "step": 15000 + }, + { + "epoch": 0.98, + "eval_logits/chosen": -2.325486660003662, + "eval_logits/rejected": -2.136941432952881, + "eval_logps/chosen": -230.58958435058594, + "eval_logps/rejected": -218.62281799316406, + "eval_loss": 0.6897296905517578, + "eval_rewards/accuracies": 0.6384999752044678, + "eval_rewards/chosen": 0.014153635129332542, + "eval_rewards/margins": 0.08426273614168167, + "eval_rewards/rejected": -0.07010909914970398, + "eval_runtime": 709.7214, + "eval_samples_per_second": 2.818, + "eval_steps_per_second": 1.409, + "step": 15000 + }, + { + "epoch": 0.98, + "learning_rate": 4.893819567644564e-09, + "logits/chosen": -2.2012670040130615, + "logits/rejected": -2.1785387992858887, + "logps/chosen": -188.42910766601562, + "logps/rejected": -204.0313720703125, + "loss": 0.6902, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.004323553293943405, + "rewards/margins": 0.054262660443782806, + "rewards/rejected": -0.05858622118830681, + "step": 15010 + }, + { + "epoch": 0.98, + "learning_rate": 4.543231207107257e-09, + "logits/chosen": -2.2762811183929443, + "logits/rejected": -2.059688091278076, + "logps/chosen": -257.8048400878906, + "logps/rejected": -240.6076202392578, + "loss": 0.6926, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.00016754865646362305, + "rewards/margins": 0.0641152560710907, + "rewards/rejected": -0.06394769996404648, + "step": 15020 + }, + { + "epoch": 0.98, + "learning_rate": 4.205660371488785e-09, + "logits/chosen": -2.5772321224212646, + "logits/rejected": -2.2152597904205322, + "logps/chosen": -282.9827575683594, + "logps/rejected": -245.66513061523438, + "loss": 0.6931, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.011705792509019375, + "rewards/margins": 0.04472345858812332, + "rewards/rejected": -0.03301766887307167, + "step": 15030 + }, + { + "epoch": 0.98, + "learning_rate": 3.88110882172471e-09, + "logits/chosen": -2.2456133365631104, + "logits/rejected": -2.1853480339050293, + "logps/chosen": -220.9130859375, + "logps/rejected": -221.7021026611328, + "loss": 0.6903, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.0021468736231327057, + "rewards/margins": 0.05425567179918289, + "rewards/rejected": -0.0564025342464447, + "step": 15040 + }, + { + "epoch": 0.98, + "learning_rate": 3.569578250834371e-09, + "logits/chosen": -2.3457345962524414, + "logits/rejected": -2.0774590969085693, + "logps/chosen": -298.05279541015625, + "logps/rejected": -271.5147705078125, + "loss": 0.6879, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.03925010561943054, + "rewards/margins": 0.1122470498085022, + "rewards/rejected": -0.07299693673849106, + "step": 15050 + }, + { + "epoch": 0.99, + "learning_rate": 3.2710702839139353e-09, + "logits/chosen": -2.373473644256592, + "logits/rejected": -2.229529857635498, + "logps/chosen": -202.07412719726562, + "logps/rejected": -216.16708374023438, + "loss": 0.6918, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.025251392275094986, + "rewards/margins": 0.0481976643204689, + "rewards/rejected": -0.022946273908019066, + "step": 15060 + }, + { + "epoch": 0.99, + "learning_rate": 2.9855864781272448e-09, + "logits/chosen": -2.3376505374908447, + "logits/rejected": -2.3747403621673584, + "logps/chosen": -209.34140014648438, + "logps/rejected": -250.8092498779297, + "loss": 0.6905, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.023322973400354385, + "rewards/margins": 0.06673813611268997, + "rewards/rejected": -0.04341515898704529, + "step": 15070 + }, + { + "epoch": 0.99, + "learning_rate": 2.7131283226977665e-09, + "logits/chosen": -2.3406639099121094, + "logits/rejected": -2.4078879356384277, + "logps/chosen": -217.74288940429688, + "logps/rejected": -240.52072143554688, + "loss": 0.6901, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.017053885385394096, + "rewards/margins": 0.08927709609270096, + "rewards/rejected": -0.07222320139408112, + "step": 15080 + }, + { + "epoch": 0.99, + "learning_rate": 2.4536972389008205e-09, + "logits/chosen": -2.2868685722351074, + "logits/rejected": -2.058229446411133, + "logps/chosen": -233.2720489501953, + "logps/rejected": -207.40798950195312, + "loss": 0.6884, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.038629334419965744, + "rewards/margins": 0.118269182741642, + "rewards/rejected": -0.07963985949754715, + "step": 15090 + }, + { + "epoch": 0.99, + "learning_rate": 2.20729458005553e-09, + "logits/chosen": -2.201646089553833, + "logits/rejected": -2.040139675140381, + "logps/chosen": -191.54600524902344, + "logps/rejected": -181.17190551757812, + "loss": 0.6882, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.028632348403334618, + "rewards/margins": 0.11827573925256729, + "rewards/rejected": -0.08964338153600693, + "step": 15100 + }, + { + "epoch": 0.99, + "eval_logits/chosen": -2.3254594802856445, + "eval_logits/rejected": -2.1368985176086426, + "eval_logps/chosen": -230.5937042236328, + "eval_logps/rejected": -218.6256866455078, + "eval_loss": 0.6897294521331787, + "eval_rewards/accuracies": 0.6389999985694885, + "eval_rewards/chosen": 0.014112350530922413, + "eval_rewards/margins": 0.08425014466047287, + "eval_rewards/rejected": -0.07013778388500214, + "eval_runtime": 710.0809, + "eval_samples_per_second": 2.817, + "eval_steps_per_second": 1.408, + "step": 15100 + }, + { + "epoch": 0.99, + "learning_rate": 1.9739216315192712e-09, + "logits/chosen": -2.3063950538635254, + "logits/rejected": -2.096644163131714, + "logps/chosen": -227.7020263671875, + "logps/rejected": -210.0640106201172, + "loss": 0.6913, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.016067935153841972, + "rewards/margins": 0.06059398502111435, + "rewards/rejected": -0.044526055455207825, + "step": 15110 + }, + { + "epoch": 0.99, + "learning_rate": 1.7535796106796231e-09, + "logits/chosen": -2.3938944339752197, + "logits/rejected": -2.072892665863037, + "logps/chosen": -276.0992431640625, + "logps/rejected": -202.15260314941406, + "loss": 0.6905, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.005628110375255346, + "rewards/margins": 0.06384317576885223, + "rewards/rejected": -0.06947128474712372, + "step": 15120 + }, + { + "epoch": 0.99, + "learning_rate": 1.5462696669482636e-09, + "logits/chosen": -2.359903335571289, + "logits/rejected": -2.2396528720855713, + "logps/chosen": -219.18911743164062, + "logps/rejected": -233.8794403076172, + "loss": 0.6885, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.020530302077531815, + "rewards/margins": 0.07046308368444443, + "rewards/rejected": -0.04993278905749321, + "step": 15130 + }, + { + "epoch": 0.99, + "learning_rate": 1.3519928817556927e-09, + "logits/chosen": -2.21510648727417, + "logits/rejected": -2.1697449684143066, + "logps/chosen": -171.1583251953125, + "logps/rejected": -178.51443481445312, + "loss": 0.6903, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.02235172688961029, + "rewards/margins": 0.0680917277932167, + "rewards/rejected": -0.045739997178316116, + "step": 15140 + }, + { + "epoch": 0.99, + "learning_rate": 1.1707502685448512e-09, + "logits/chosen": -2.445695638656616, + "logits/rejected": -2.1078381538391113, + "logps/chosen": -222.18508911132812, + "logps/rejected": -176.2418975830078, + "loss": 0.6863, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.0008492677588947117, + "rewards/margins": 0.12245283275842667, + "rewards/rejected": -0.12160356342792511, + "step": 15150 + }, + { + "epoch": 0.99, + "learning_rate": 1.002542772765569e-09, + "logits/chosen": -2.2757375240325928, + "logits/rejected": -1.9920152425765991, + "logps/chosen": -187.7380828857422, + "logps/rejected": -158.5767059326172, + "loss": 0.6893, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.008657276630401611, + "rewards/margins": 0.09146241843700409, + "rewards/rejected": -0.08280514925718307, + "step": 15160 + }, + { + "epoch": 0.99, + "learning_rate": 8.473712718709559e-10, + "logits/chosen": -2.1393208503723145, + "logits/rejected": -2.1515212059020996, + "logps/chosen": -193.9955596923828, + "logps/rejected": -187.0467987060547, + "loss": 0.6926, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.002506362274289131, + "rewards/margins": 0.03829359635710716, + "rewards/rejected": -0.040799956768751144, + "step": 15170 + }, + { + "epoch": 0.99, + "learning_rate": 7.052365753112966e-10, + "logits/chosen": -2.1680989265441895, + "logits/rejected": -1.8749319314956665, + "logps/chosen": -234.80142211914062, + "logps/rejected": -226.61306762695312, + "loss": 0.6895, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.0051465281285345554, + "rewards/margins": 0.10587289184331894, + "rewards/rejected": -0.10072635114192963, + "step": 15180 + }, + { + "epoch": 0.99, + "learning_rate": 5.761394245307195e-10, + "logits/chosen": -2.155925989151001, + "logits/rejected": -2.18461275100708, + "logps/chosen": -235.2099609375, + "logps/rejected": -238.4646759033203, + "loss": 0.692, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.018722042441368103, + "rewards/margins": 0.04466164484620094, + "rewards/rejected": -0.02593959867954254, + "step": 15190 + }, + { + "epoch": 0.99, + "learning_rate": 4.6008049296358826e-10, + "logits/chosen": -2.229979991912842, + "logits/rejected": -2.126420736312866, + "logps/chosen": -185.88441467285156, + "logps/rejected": -165.7515411376953, + "loss": 0.6896, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0013815786223858595, + "rewards/margins": 0.08110791444778442, + "rewards/rejected": -0.07972635328769684, + "step": 15200 + }, + { + "epoch": 0.99, + "eval_logits/chosen": -2.325113296508789, + "eval_logits/rejected": -2.1365654468536377, + "eval_logps/chosen": -230.59994506835938, + "eval_logps/rejected": -218.62445068359375, + "eval_loss": 0.6897304654121399, + "eval_rewards/accuracies": 0.6365000009536743, + "eval_rewards/chosen": 0.014050180092453957, + "eval_rewards/margins": 0.08417567610740662, + "eval_rewards/rejected": -0.07012549042701721, + "eval_runtime": 709.6375, + "eval_samples_per_second": 2.818, + "eval_steps_per_second": 1.409, + "step": 15200 + }, + { + "epoch": 1.0, + "learning_rate": 3.5706038603006146e-10, + "logits/chosen": -2.4180667400360107, + "logits/rejected": -2.3709511756896973, + "logps/chosen": -284.6794128417969, + "logps/rejected": -279.5739440917969, + "loss": 0.6909, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.04930936545133591, + "rewards/margins": 0.09137637913227081, + "rewards/rejected": -0.04206700250506401, + "step": 15210 + }, + { + "epoch": 1.0, + "learning_rate": 2.670796411333165e-10, + "logits/chosen": -2.588311195373535, + "logits/rejected": -2.297461986541748, + "logps/chosen": -219.0515594482422, + "logps/rejected": -215.50772094726562, + "loss": 0.6888, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.00759897381067276, + "rewards/margins": 0.08762288093566895, + "rewards/rejected": -0.08002390712499619, + "step": 15220 + }, + { + "epoch": 1.0, + "learning_rate": 1.9013872765677455e-10, + "logits/chosen": -2.3167214393615723, + "logits/rejected": -2.108008861541748, + "logps/chosen": -216.2996368408203, + "logps/rejected": -207.8689422607422, + "loss": 0.6923, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03213484585285187, + "rewards/margins": 0.04103900119662285, + "rewards/rejected": -0.008904160931706429, + "step": 15230 + }, + { + "epoch": 1.0, + "learning_rate": 1.262380469624347e-10, + "logits/chosen": -2.2934911251068115, + "logits/rejected": -2.121281862258911, + "logps/chosen": -202.36239624023438, + "logps/rejected": -185.4467010498047, + "loss": 0.6913, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.01644532009959221, + "rewards/margins": 0.05994013696908951, + "rewards/rejected": -0.0434948205947876, + "step": 15240 + }, + { + "epoch": 1.0, + "learning_rate": 7.53779323872661e-11, + "logits/chosen": -2.1865756511688232, + "logits/rejected": -2.289961576461792, + "logps/chosen": -190.6034698486328, + "logps/rejected": -205.8804931640625, + "loss": 0.6886, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.012599390931427479, + "rewards/margins": 0.09233128279447556, + "rewards/rejected": -0.07973189651966095, + "step": 15250 + }, + { + "epoch": 1.0, + "learning_rate": 3.7558649242652734e-11, + "logits/chosen": -2.4936611652374268, + "logits/rejected": -2.2574238777160645, + "logps/chosen": -395.0047912597656, + "logps/rejected": -322.8507995605469, + "loss": 0.6908, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.016790423542261124, + "rewards/margins": 0.07416915148496628, + "rewards/rejected": -0.05737873911857605, + "step": 15260 + }, + { + "epoch": 1.0, + "learning_rate": 1.2780394812450526e-11, + "logits/chosen": -2.1327195167541504, + "logits/rejected": -2.085716724395752, + "logps/chosen": -233.3424072265625, + "logps/rejected": -242.1055145263672, + "loss": 0.6896, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.011986208148300648, + "rewards/margins": 0.08845750242471695, + "rewards/rejected": -0.10044372081756592, + "step": 15270 + }, + { + "epoch": 1.0, + "learning_rate": 1.0432983521546646e-12, + "logits/chosen": -2.170581817626953, + "logits/rejected": -2.0781712532043457, + "logps/chosen": -182.6875, + "logps/rejected": -211.4102325439453, + "loss": 0.6886, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.007787027861922979, + "rewards/margins": 0.09877107292413712, + "rewards/rejected": -0.0909840390086174, + "step": 15280 + }, + { + "epoch": 1.0, + "step": 15284, + "total_flos": 0.0, + "train_loss": 0.6900739747015976, + "train_runtime": 171639.7836, + "train_samples_per_second": 0.356, + "train_steps_per_second": 0.089 + } + ], + "logging_steps": 10, + "max_steps": 15284, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}