{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999509282734931, "eval_steps": 100, "global_step": 15283, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 5.21875, "learning_rate": 3.270111183780249e-09, "logits/chosen": -2.3714964389801025, "logits/rejected": -2.5240108966827393, "logps/chosen": -259.21142578125, "logps/rejected": -96.47927856445312, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0, "grad_norm": 4.65625, "learning_rate": 3.270111183780249e-08, "logits/chosen": -2.871506690979004, "logits/rejected": -2.8574600219726562, "logps/chosen": -304.3540344238281, "logps/rejected": -200.63580322265625, "loss": 0.6929, "rewards/accuracies": 0.5, "rewards/chosen": -0.0002421797253191471, "rewards/margins": 0.0005761772044934332, "rewards/rejected": -0.0008183569880202413, "step": 10 }, { "epoch": 0.0, "grad_norm": 3.625, "learning_rate": 6.540222367560497e-08, "logits/chosen": -2.8375298976898193, "logits/rejected": -2.835369348526001, "logps/chosen": -249.16098022460938, "logps/rejected": -206.52462768554688, "loss": 0.693, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0003982210182584822, "rewards/margins": 0.00023810142010916024, "rewards/rejected": 0.0001601195108378306, "step": 20 }, { "epoch": 0.0, "grad_norm": 4.78125, "learning_rate": 9.810333551340746e-08, "logits/chosen": -2.897075653076172, "logits/rejected": -2.994014263153076, "logps/chosen": -255.46939086914062, "logps/rejected": -222.4738311767578, "loss": 0.6935, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.0006572299171239138, "rewards/margins": -0.000688147556502372, "rewards/rejected": 0.0013453774154186249, "step": 30 }, { "epoch": 0.0, "grad_norm": 4.5625, "learning_rate": 1.3080444735120995e-07, "logits/chosen": -2.8021187782287598, "logits/rejected": -2.768587350845337, "logps/chosen": -221.78329467773438, "logps/rejected": -209.73043823242188, "loss": 0.6934, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.0007872491260059178, "rewards/margins": -0.0005158445565029979, "rewards/rejected": 0.0013030937407165766, "step": 40 }, { "epoch": 0.0, "grad_norm": 5.28125, "learning_rate": 1.6350555918901243e-07, "logits/chosen": -2.9382517337799072, "logits/rejected": -2.8852016925811768, "logps/chosen": -252.9862518310547, "logps/rejected": -212.0840301513672, "loss": 0.6928, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0023783831857144833, "rewards/margins": 0.0007520966464653611, "rewards/rejected": 0.0016262864228338003, "step": 50 }, { "epoch": 0.0, "grad_norm": 4.9375, "learning_rate": 1.9620667102681492e-07, "logits/chosen": -2.8755006790161133, "logits/rejected": -2.882744312286377, "logps/chosen": -200.30984497070312, "logps/rejected": -239.2034149169922, "loss": 0.693, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0032703480683267117, "rewards/margins": 0.0002277972671436146, "rewards/rejected": 0.0030425512231886387, "step": 60 }, { "epoch": 0.0, "grad_norm": 4.53125, "learning_rate": 2.289077828646174e-07, "logits/chosen": -2.8780760765075684, "logits/rejected": -2.8211326599121094, "logps/chosen": -248.4451446533203, "logps/rejected": -208.0590362548828, "loss": 0.6915, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.008318725042045116, "rewards/margins": 0.0032786305528134108, "rewards/rejected": 0.005040094722062349, "step": 70 }, { "epoch": 0.01, "grad_norm": 4.59375, "learning_rate": 2.616088947024199e-07, "logits/chosen": -3.063352584838867, "logits/rejected": -3.05415678024292, "logps/chosen": -271.71221923828125, "logps/rejected": -386.14666748046875, "loss": 0.6919, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.014423075132071972, "rewards/margins": 0.0025670293252915144, "rewards/rejected": 0.011856046505272388, "step": 80 }, { "epoch": 0.01, "grad_norm": 3.796875, "learning_rate": 2.943100065402224e-07, "logits/chosen": -2.9700839519500732, "logits/rejected": -2.986363649368286, "logps/chosen": -275.89312744140625, "logps/rejected": -267.6192626953125, "loss": 0.693, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.014839875511825085, "rewards/margins": 0.00039755358011461794, "rewards/rejected": 0.014442321844398975, "step": 90 }, { "epoch": 0.01, "grad_norm": 4.90625, "learning_rate": 3.2701111837802487e-07, "logits/chosen": -2.9539151191711426, "logits/rejected": -2.908043384552002, "logps/chosen": -260.79583740234375, "logps/rejected": -195.0166473388672, "loss": 0.6914, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.017197085544466972, "rewards/margins": 0.003485057968646288, "rewards/rejected": 0.013712028041481972, "step": 100 }, { "epoch": 0.01, "eval_logits/chosen": -2.932523012161255, "eval_logits/rejected": -2.9204440116882324, "eval_logps/chosen": -262.6278991699219, "eval_logps/rejected": -243.00701904296875, "eval_loss": 0.6910374164581299, "eval_rewards/accuracies": 0.621999979019165, "eval_rewards/chosen": 0.019916877150535583, "eval_rewards/margins": 0.004270330537110567, "eval_rewards/rejected": 0.015646545216441154, "eval_runtime": 1083.6952, "eval_samples_per_second": 1.846, "eval_steps_per_second": 1.846, "step": 100 }, { "epoch": 0.01, "grad_norm": 4.5, "learning_rate": 3.5971223021582736e-07, "logits/chosen": -2.7373836040496826, "logits/rejected": -2.6748037338256836, "logps/chosen": -268.8216857910156, "logps/rejected": -225.33969116210938, "loss": 0.6914, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.02248796820640564, "rewards/margins": 0.0036007389426231384, "rewards/rejected": 0.01888723112642765, "step": 110 }, { "epoch": 0.01, "grad_norm": 5.03125, "learning_rate": 3.9241334205362984e-07, "logits/chosen": -3.0358119010925293, "logits/rejected": -3.0025687217712402, "logps/chosen": -338.47662353515625, "logps/rejected": -336.485107421875, "loss": 0.6911, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.02893940731883049, "rewards/margins": 0.004074341617524624, "rewards/rejected": 0.02486506476998329, "step": 120 }, { "epoch": 0.01, "grad_norm": 4.9375, "learning_rate": 4.251144538914324e-07, "logits/chosen": -2.873260974884033, "logits/rejected": -2.9258310794830322, "logps/chosen": -295.185546875, "logps/rejected": -280.46087646484375, "loss": 0.689, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.029246967285871506, "rewards/margins": 0.008405114524066448, "rewards/rejected": 0.020841851830482483, "step": 130 }, { "epoch": 0.01, "grad_norm": 4.125, "learning_rate": 4.578155657292348e-07, "logits/chosen": -2.873358964920044, "logits/rejected": -2.827579975128174, "logps/chosen": -252.87808227539062, "logps/rejected": -243.4793701171875, "loss": 0.6872, "rewards/accuracies": 0.75, "rewards/chosen": 0.03245113044977188, "rewards/margins": 0.012014606036245823, "rewards/rejected": 0.020436523482203484, "step": 140 }, { "epoch": 0.01, "grad_norm": 3.8125, "learning_rate": 4.905166775670374e-07, "logits/chosen": -2.8678393363952637, "logits/rejected": -2.765681028366089, "logps/chosen": -288.5406494140625, "logps/rejected": -249.72433471679688, "loss": 0.6857, "rewards/accuracies": 0.75, "rewards/chosen": 0.04014243930578232, "rewards/margins": 0.015096647664904594, "rewards/rejected": 0.025045791640877724, "step": 150 }, { "epoch": 0.01, "grad_norm": 4.78125, "learning_rate": 5.232177894048398e-07, "logits/chosen": -3.0254225730895996, "logits/rejected": -3.034255027770996, "logps/chosen": -272.1219177246094, "logps/rejected": -249.5802764892578, "loss": 0.6888, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.03549903631210327, "rewards/margins": 0.008817009627819061, "rewards/rejected": 0.02668202482163906, "step": 160 }, { "epoch": 0.01, "grad_norm": 4.53125, "learning_rate": 5.559189012426422e-07, "logits/chosen": -2.9497790336608887, "logits/rejected": -2.9547977447509766, "logps/chosen": -254.0339813232422, "logps/rejected": -262.98468017578125, "loss": 0.69, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.043996766209602356, "rewards/margins": 0.00637472327798605, "rewards/rejected": 0.03762204200029373, "step": 170 }, { "epoch": 0.01, "grad_norm": 5.03125, "learning_rate": 5.886200130804448e-07, "logits/chosen": -2.9221205711364746, "logits/rejected": -2.8373873233795166, "logps/chosen": -274.3648376464844, "logps/rejected": -204.6705322265625, "loss": 0.689, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.03732004761695862, "rewards/margins": 0.008404549211263657, "rewards/rejected": 0.02891550026834011, "step": 180 }, { "epoch": 0.01, "grad_norm": 3.53125, "learning_rate": 6.213211249182473e-07, "logits/chosen": -3.010490894317627, "logits/rejected": -2.941779375076294, "logps/chosen": -250.18344116210938, "logps/rejected": -213.8663787841797, "loss": 0.688, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.04371601343154907, "rewards/margins": 0.010541839525103569, "rewards/rejected": 0.03317417949438095, "step": 190 }, { "epoch": 0.01, "grad_norm": 4.75, "learning_rate": 6.540222367560497e-07, "logits/chosen": -2.9971470832824707, "logits/rejected": -3.0381886959075928, "logps/chosen": -279.80523681640625, "logps/rejected": -233.8086700439453, "loss": 0.6877, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.04553839564323425, "rewards/margins": 0.011376335285604, "rewards/rejected": 0.03416206315159798, "step": 200 }, { "epoch": 0.01, "eval_logits/chosen": -2.935272216796875, "eval_logits/rejected": -2.920989513397217, "eval_logps/chosen": -260.1324768066406, "eval_logps/rejected": -241.3638916015625, "eval_loss": 0.6869379878044128, "eval_rewards/accuracies": 0.6255000233650208, "eval_rewards/chosen": 0.04487115517258644, "eval_rewards/margins": 0.012793445028364658, "eval_rewards/rejected": 0.03207770735025406, "eval_runtime": 1083.5, "eval_samples_per_second": 1.846, "eval_steps_per_second": 1.846, "step": 200 }, { "epoch": 0.01, "grad_norm": 4.65625, "learning_rate": 6.867233485938523e-07, "logits/chosen": -2.9325437545776367, "logits/rejected": -2.8707199096679688, "logps/chosen": -258.85528564453125, "logps/rejected": -232.4899444580078, "loss": 0.6861, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.04634648934006691, "rewards/margins": 0.01442038081586361, "rewards/rejected": 0.03192610666155815, "step": 210 }, { "epoch": 0.01, "grad_norm": 3.546875, "learning_rate": 7.194244604316547e-07, "logits/chosen": -2.811739444732666, "logits/rejected": -2.6866989135742188, "logps/chosen": -225.83352661132812, "logps/rejected": -211.3502655029297, "loss": 0.6854, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.04393581673502922, "rewards/margins": 0.01593783125281334, "rewards/rejected": 0.02799798548221588, "step": 220 }, { "epoch": 0.02, "grad_norm": 4.5, "learning_rate": 7.521255722694571e-07, "logits/chosen": -3.0251004695892334, "logits/rejected": -2.9951930046081543, "logps/chosen": -317.44622802734375, "logps/rejected": -242.2830810546875, "loss": 0.6846, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.05557548254728317, "rewards/margins": 0.017802467569708824, "rewards/rejected": 0.0377730093896389, "step": 230 }, { "epoch": 0.02, "grad_norm": 6.3125, "learning_rate": 7.848266841072597e-07, "logits/chosen": -3.0709195137023926, "logits/rejected": -3.0321555137634277, "logps/chosen": -290.70233154296875, "logps/rejected": -261.47039794921875, "loss": 0.6819, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.05848951265215874, "rewards/margins": 0.023060956969857216, "rewards/rejected": 0.03542856127023697, "step": 240 }, { "epoch": 0.02, "grad_norm": 4.9375, "learning_rate": 8.175277959450622e-07, "logits/chosen": -2.9508426189422607, "logits/rejected": -2.947136402130127, "logps/chosen": -313.30999755859375, "logps/rejected": -262.879638671875, "loss": 0.682, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.05648805946111679, "rewards/margins": 0.023289453238248825, "rewards/rejected": 0.033198606222867966, "step": 250 }, { "epoch": 0.02, "grad_norm": 6.59375, "learning_rate": 8.502289077828648e-07, "logits/chosen": -2.920016288757324, "logits/rejected": -2.860016107559204, "logps/chosen": -306.9877014160156, "logps/rejected": -247.0545196533203, "loss": 0.682, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.05868823081254959, "rewards/margins": 0.02262522466480732, "rewards/rejected": 0.036063000559806824, "step": 260 }, { "epoch": 0.02, "grad_norm": 5.3125, "learning_rate": 8.829300196206672e-07, "logits/chosen": -3.000159978866577, "logits/rejected": -2.9538631439208984, "logps/chosen": -216.26303100585938, "logps/rejected": -255.2703857421875, "loss": 0.6882, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.04876617342233658, "rewards/margins": 0.010648338124155998, "rewards/rejected": 0.03811783716082573, "step": 270 }, { "epoch": 0.02, "grad_norm": 4.25, "learning_rate": 9.156311314584696e-07, "logits/chosen": -2.925166606903076, "logits/rejected": -2.8267123699188232, "logps/chosen": -258.9403381347656, "logps/rejected": -215.9775848388672, "loss": 0.6825, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.05243561789393425, "rewards/margins": 0.022278418764472008, "rewards/rejected": 0.030157197266817093, "step": 280 }, { "epoch": 0.02, "grad_norm": 2.90625, "learning_rate": 9.483322432962722e-07, "logits/chosen": -3.0200672149658203, "logits/rejected": -2.96281099319458, "logps/chosen": -306.41943359375, "logps/rejected": -261.789794921875, "loss": 0.6802, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.056604497134685516, "rewards/margins": 0.027083929628133774, "rewards/rejected": 0.029520561918616295, "step": 290 }, { "epoch": 0.02, "grad_norm": 4.71875, "learning_rate": 9.810333551340747e-07, "logits/chosen": -2.8969147205352783, "logits/rejected": -2.842417001724243, "logps/chosen": -198.95236206054688, "logps/rejected": -199.50930786132812, "loss": 0.6841, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.04748619347810745, "rewards/margins": 0.01876116544008255, "rewards/rejected": 0.0287250317633152, "step": 300 }, { "epoch": 0.02, "eval_logits/chosen": -2.9327452182769775, "eval_logits/rejected": -2.9183156490325928, "eval_logps/chosen": -258.8525085449219, "eval_logps/rejected": -241.50799560546875, "eval_loss": 0.6803670525550842, "eval_rewards/accuracies": 0.6495000123977661, "eval_rewards/chosen": 0.057670701295137405, "eval_rewards/margins": 0.027033928781747818, "eval_rewards/rejected": 0.03063676878809929, "eval_runtime": 1081.8943, "eval_samples_per_second": 1.849, "eval_steps_per_second": 1.849, "step": 300 }, { "epoch": 0.02, "grad_norm": 4.9375, "learning_rate": 1.0137344669718771e-06, "logits/chosen": -2.9696617126464844, "logits/rejected": -3.028501510620117, "logps/chosen": -229.685546875, "logps/rejected": -238.64599609375, "loss": 0.6839, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.056942105293273926, "rewards/margins": 0.019650690257549286, "rewards/rejected": 0.03729141503572464, "step": 310 }, { "epoch": 0.02, "grad_norm": 6.03125, "learning_rate": 1.0464355788096796e-06, "logits/chosen": -3.035881519317627, "logits/rejected": -2.908747911453247, "logps/chosen": -270.2947998046875, "logps/rejected": -203.93637084960938, "loss": 0.6792, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.05873095244169235, "rewards/margins": 0.03004259243607521, "rewards/rejected": 0.02868836559355259, "step": 320 }, { "epoch": 0.02, "grad_norm": 5.1875, "learning_rate": 1.079136690647482e-06, "logits/chosen": -2.905421495437622, "logits/rejected": -2.9303717613220215, "logps/chosen": -278.3734130859375, "logps/rejected": -231.7566375732422, "loss": 0.6817, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.05422138422727585, "rewards/margins": 0.024412620812654495, "rewards/rejected": 0.029808763414621353, "step": 330 }, { "epoch": 0.02, "grad_norm": 4.5, "learning_rate": 1.1118378024852844e-06, "logits/chosen": -2.9940247535705566, "logits/rejected": -2.9909911155700684, "logps/chosen": -251.1647186279297, "logps/rejected": -243.7392120361328, "loss": 0.6755, "rewards/accuracies": 0.625, "rewards/chosen": 0.060316819697618484, "rewards/margins": 0.03732689842581749, "rewards/rejected": 0.022989919409155846, "step": 340 }, { "epoch": 0.02, "grad_norm": 4.53125, "learning_rate": 1.144538914323087e-06, "logits/chosen": -2.965489625930786, "logits/rejected": -2.930488348007202, "logps/chosen": -281.09161376953125, "logps/rejected": -235.97427368164062, "loss": 0.6666, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.057181715965270996, "rewards/margins": 0.05733555555343628, "rewards/rejected": -0.0001538395881652832, "step": 350 }, { "epoch": 0.02, "grad_norm": 5.0, "learning_rate": 1.1772400261608895e-06, "logits/chosen": -2.8775887489318848, "logits/rejected": -2.8314852714538574, "logps/chosen": -226.9202117919922, "logps/rejected": -245.89688110351562, "loss": 0.6845, "rewards/accuracies": 0.625, "rewards/chosen": 0.03828488662838936, "rewards/margins": 0.020719682797789574, "rewards/rejected": 0.017565207555890083, "step": 360 }, { "epoch": 0.02, "grad_norm": 4.09375, "learning_rate": 1.2099411379986922e-06, "logits/chosen": -2.924715042114258, "logits/rejected": -2.859647750854492, "logps/chosen": -266.81884765625, "logps/rejected": -218.14755249023438, "loss": 0.6713, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.06819824129343033, "rewards/margins": 0.045499593019485474, "rewards/rejected": 0.02269863896071911, "step": 370 }, { "epoch": 0.02, "grad_norm": 4.8125, "learning_rate": 1.2426422498364946e-06, "logits/chosen": -2.965376138687134, "logits/rejected": -2.930497407913208, "logps/chosen": -252.59194946289062, "logps/rejected": -249.0243377685547, "loss": 0.6718, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.06902029365301132, "rewards/margins": 0.045716192573308945, "rewards/rejected": 0.023304106667637825, "step": 380 }, { "epoch": 0.03, "grad_norm": 3.25, "learning_rate": 1.2753433616742968e-06, "logits/chosen": -2.871094226837158, "logits/rejected": -2.857203960418701, "logps/chosen": -256.82305908203125, "logps/rejected": -260.06719970703125, "loss": 0.6815, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.05356285721063614, "rewards/margins": 0.025547217577695847, "rewards/rejected": 0.02801564335823059, "step": 390 }, { "epoch": 0.03, "grad_norm": 4.53125, "learning_rate": 1.3080444735120995e-06, "logits/chosen": -2.8761749267578125, "logits/rejected": -2.873929262161255, "logps/chosen": -200.96322631835938, "logps/rejected": -228.7638702392578, "loss": 0.6737, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.041652075946331024, "rewards/margins": 0.04071967303752899, "rewards/rejected": 0.0009324053535237908, "step": 400 }, { "epoch": 0.03, "eval_logits/chosen": -2.9118146896362305, "eval_logits/rejected": -2.896183729171753, "eval_logps/chosen": -259.80767822265625, "eval_logps/rejected": -244.57443237304688, "eval_loss": 0.6713329553604126, "eval_rewards/accuracies": 0.6549999713897705, "eval_rewards/chosen": 0.04811927676200867, "eval_rewards/margins": 0.04814711958169937, "eval_rewards/rejected": -2.7838468668051064e-05, "eval_runtime": 1082.885, "eval_samples_per_second": 1.847, "eval_steps_per_second": 1.847, "step": 400 }, { "epoch": 0.03, "grad_norm": 5.25, "learning_rate": 1.3407455853499021e-06, "logits/chosen": -2.965993881225586, "logits/rejected": -2.8793087005615234, "logps/chosen": -312.7399597167969, "logps/rejected": -297.9888916015625, "loss": 0.6545, "rewards/accuracies": 0.625, "rewards/chosen": 0.07325529307126999, "rewards/margins": 0.0822000727057457, "rewards/rejected": -0.008944791741669178, "step": 410 }, { "epoch": 0.03, "grad_norm": 4.125, "learning_rate": 1.3734466971877046e-06, "logits/chosen": -2.7868053913116455, "logits/rejected": -2.7868714332580566, "logps/chosen": -206.22634887695312, "logps/rejected": -238.8357391357422, "loss": 0.6771, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.006576336920261383, "rewards/margins": 0.03482916206121445, "rewards/rejected": -0.04140549898147583, "step": 420 }, { "epoch": 0.03, "grad_norm": 4.3125, "learning_rate": 1.406147809025507e-06, "logits/chosen": -2.900200128555298, "logits/rejected": -2.9543025493621826, "logps/chosen": -241.38558959960938, "logps/rejected": -218.28988647460938, "loss": 0.6614, "rewards/accuracies": 0.75, "rewards/chosen": 0.036901749670505524, "rewards/margins": 0.06990204751491547, "rewards/rejected": -0.03300030902028084, "step": 430 }, { "epoch": 0.03, "grad_norm": 4.53125, "learning_rate": 1.4388489208633094e-06, "logits/chosen": -2.8053104877471924, "logits/rejected": -2.77720308303833, "logps/chosen": -190.7186737060547, "logps/rejected": -205.59109497070312, "loss": 0.6637, "rewards/accuracies": 0.75, "rewards/chosen": 0.02005600929260254, "rewards/margins": 0.06334303319454193, "rewards/rejected": -0.043287020176649094, "step": 440 }, { "epoch": 0.03, "grad_norm": 4.5, "learning_rate": 1.471550032701112e-06, "logits/chosen": -3.1083502769470215, "logits/rejected": -3.008763074874878, "logps/chosen": -335.1595764160156, "logps/rejected": -281.5724182128906, "loss": 0.6546, "rewards/accuracies": 0.625, "rewards/chosen": 0.04983304440975189, "rewards/margins": 0.087971530854702, "rewards/rejected": -0.0381384901702404, "step": 450 }, { "epoch": 0.03, "grad_norm": 4.78125, "learning_rate": 1.5042511445389143e-06, "logits/chosen": -2.9354960918426514, "logits/rejected": -2.9576497077941895, "logps/chosen": -330.7381896972656, "logps/rejected": -258.47650146484375, "loss": 0.6592, "rewards/accuracies": 0.625, "rewards/chosen": 0.04436716437339783, "rewards/margins": 0.07494419068098068, "rewards/rejected": -0.030577033758163452, "step": 460 }, { "epoch": 0.03, "grad_norm": 7.03125, "learning_rate": 1.536952256376717e-06, "logits/chosen": -2.843052387237549, "logits/rejected": -2.796456813812256, "logps/chosen": -224.9968719482422, "logps/rejected": -194.6265411376953, "loss": 0.6372, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.005335536785423756, "rewards/margins": 0.12108583748340607, "rewards/rejected": -0.12642137706279755, "step": 470 }, { "epoch": 0.03, "grad_norm": 7.8125, "learning_rate": 1.5696533682145194e-06, "logits/chosen": -2.967621326446533, "logits/rejected": -2.890089511871338, "logps/chosen": -282.5645751953125, "logps/rejected": -211.98013305664062, "loss": 0.6363, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.030698075890541077, "rewards/margins": 0.13012480735778809, "rewards/rejected": -0.09942670911550522, "step": 480 }, { "epoch": 0.03, "grad_norm": 5.40625, "learning_rate": 1.602354480052322e-06, "logits/chosen": -2.8703322410583496, "logits/rejected": -2.799984931945801, "logps/chosen": -301.16851806640625, "logps/rejected": -229.3596649169922, "loss": 0.6317, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.03850791975855827, "rewards/margins": 0.1353929191827774, "rewards/rejected": -0.09688499569892883, "step": 490 }, { "epoch": 0.03, "grad_norm": 8.375, "learning_rate": 1.6350555918901245e-06, "logits/chosen": -2.9017393589019775, "logits/rejected": -2.937171459197998, "logps/chosen": -253.47012329101562, "logps/rejected": -232.04702758789062, "loss": 0.6443, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.0018582321936264634, "rewards/margins": 0.11067748069763184, "rewards/rejected": -0.11253571510314941, "step": 500 }, { "epoch": 0.03, "eval_logits/chosen": -2.872239351272583, "eval_logits/rejected": -2.8543825149536133, "eval_logps/chosen": -273.21099853515625, "eval_logps/rejected": -262.4491882324219, "eval_loss": 0.6546598672866821, "eval_rewards/accuracies": 0.6725000143051147, "eval_rewards/chosen": -0.0859142392873764, "eval_rewards/margins": 0.09286098182201385, "eval_rewards/rejected": -0.17877522110939026, "eval_runtime": 1082.5674, "eval_samples_per_second": 1.847, "eval_steps_per_second": 1.847, "step": 500 }, { "epoch": 0.03, "grad_norm": 7.125, "learning_rate": 1.6677567037279269e-06, "logits/chosen": -2.8242688179016113, "logits/rejected": -2.736645460128784, "logps/chosen": -252.6578369140625, "logps/rejected": -258.61773681640625, "loss": 0.652, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.09454967826604843, "rewards/margins": 0.09924690425395966, "rewards/rejected": -0.19379660487174988, "step": 510 }, { "epoch": 0.03, "grad_norm": 8.1875, "learning_rate": 1.7004578155657295e-06, "logits/chosen": -2.839492082595825, "logits/rejected": -2.8035099506378174, "logps/chosen": -304.8105773925781, "logps/rejected": -246.3205108642578, "loss": 0.651, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.03453657776117325, "rewards/margins": 0.10316568613052368, "rewards/rejected": -0.13770225644111633, "step": 520 }, { "epoch": 0.03, "grad_norm": 8.9375, "learning_rate": 1.7331589274035318e-06, "logits/chosen": -2.869872808456421, "logits/rejected": -2.844622850418091, "logps/chosen": -218.54232788085938, "logps/rejected": -218.81689453125, "loss": 0.6614, "rewards/accuracies": 0.75, "rewards/chosen": -0.1467592567205429, "rewards/margins": 0.07301714271306992, "rewards/rejected": -0.21977639198303223, "step": 530 }, { "epoch": 0.04, "grad_norm": 4.6875, "learning_rate": 1.7658600392413344e-06, "logits/chosen": -2.7943673133850098, "logits/rejected": -2.770386219024658, "logps/chosen": -260.6991271972656, "logps/rejected": -291.608154296875, "loss": 0.6358, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.04231525585055351, "rewards/margins": 0.14172054827213287, "rewards/rejected": -0.1840357929468155, "step": 540 }, { "epoch": 0.04, "grad_norm": 7.09375, "learning_rate": 1.7985611510791368e-06, "logits/chosen": -2.690777063369751, "logits/rejected": -2.7107791900634766, "logps/chosen": -213.8583526611328, "logps/rejected": -228.35769653320312, "loss": 0.6455, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.01720379665493965, "rewards/margins": 0.1183861494064331, "rewards/rejected": -0.13558992743492126, "step": 550 }, { "epoch": 0.04, "grad_norm": 7.15625, "learning_rate": 1.8312622629169393e-06, "logits/chosen": -2.837252616882324, "logits/rejected": -2.8389735221862793, "logps/chosen": -263.4803771972656, "logps/rejected": -263.3722839355469, "loss": 0.6643, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.04863029345870018, "rewards/margins": 0.081067755818367, "rewards/rejected": -0.1296980232000351, "step": 560 }, { "epoch": 0.04, "grad_norm": 6.46875, "learning_rate": 1.8639633747547417e-06, "logits/chosen": -2.865067958831787, "logits/rejected": -2.8662147521972656, "logps/chosen": -299.93896484375, "logps/rejected": -291.2447814941406, "loss": 0.6436, "rewards/accuracies": 0.625, "rewards/chosen": -0.08727128803730011, "rewards/margins": 0.13329127430915833, "rewards/rejected": -0.22056254744529724, "step": 570 }, { "epoch": 0.04, "grad_norm": 5.5625, "learning_rate": 1.8966644865925443e-06, "logits/chosen": -2.863518714904785, "logits/rejected": -2.739290714263916, "logps/chosen": -302.12994384765625, "logps/rejected": -267.7052307128906, "loss": 0.6498, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.0654161274433136, "rewards/margins": 0.10930512100458145, "rewards/rejected": -0.17472124099731445, "step": 580 }, { "epoch": 0.04, "grad_norm": 18.75, "learning_rate": 1.9293655984303466e-06, "logits/chosen": -2.9441792964935303, "logits/rejected": -2.8443198204040527, "logps/chosen": -297.69061279296875, "logps/rejected": -300.2025146484375, "loss": 0.6615, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.09488805383443832, "rewards/margins": 0.09199028462171555, "rewards/rejected": -0.18687832355499268, "step": 590 }, { "epoch": 0.04, "grad_norm": 8.3125, "learning_rate": 1.9620667102681494e-06, "logits/chosen": -2.683650493621826, "logits/rejected": -2.6535439491271973, "logps/chosen": -253.7371368408203, "logps/rejected": -258.8272399902344, "loss": 0.6257, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.10700225830078125, "rewards/margins": 0.15831954777240753, "rewards/rejected": -0.2653217911720276, "step": 600 }, { "epoch": 0.04, "eval_logits/chosen": -2.843611001968384, "eval_logits/rejected": -2.8252251148223877, "eval_logps/chosen": -278.71319580078125, "eval_logps/rejected": -270.424072265625, "eval_loss": 0.6467114090919495, "eval_rewards/accuracies": 0.6700000166893005, "eval_rewards/chosen": -0.1409357637166977, "eval_rewards/margins": 0.11758824437856674, "eval_rewards/rejected": -0.2585240304470062, "eval_runtime": 1082.1201, "eval_samples_per_second": 1.848, "eval_steps_per_second": 1.848, "step": 600 }, { "epoch": 0.04, "grad_norm": 8.25, "learning_rate": 1.994767822105952e-06, "logits/chosen": -2.8360161781311035, "logits/rejected": -2.872096538543701, "logps/chosen": -295.8143005371094, "logps/rejected": -272.79766845703125, "loss": 0.6288, "rewards/accuracies": 0.75, "rewards/chosen": -0.10126857459545135, "rewards/margins": 0.15675410628318787, "rewards/rejected": -0.258022665977478, "step": 610 }, { "epoch": 0.04, "grad_norm": 7.1875, "learning_rate": 2.0274689339437543e-06, "logits/chosen": -2.8130650520324707, "logits/rejected": -2.8308346271514893, "logps/chosen": -303.5519104003906, "logps/rejected": -322.0184326171875, "loss": 0.6328, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.10717234760522842, "rewards/margins": 0.15358594059944153, "rewards/rejected": -0.26075828075408936, "step": 620 }, { "epoch": 0.04, "grad_norm": 3.71875, "learning_rate": 2.0601700457815567e-06, "logits/chosen": -2.657656669616699, "logits/rejected": -2.6828386783599854, "logps/chosen": -260.2118225097656, "logps/rejected": -181.19747924804688, "loss": 0.6164, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.09443898499011993, "rewards/margins": 0.19535037875175476, "rewards/rejected": -0.2897893786430359, "step": 630 }, { "epoch": 0.04, "grad_norm": 20.0, "learning_rate": 2.092871157619359e-06, "logits/chosen": -2.8782541751861572, "logits/rejected": -2.8321635723114014, "logps/chosen": -278.0833740234375, "logps/rejected": -279.48065185546875, "loss": 0.6539, "rewards/accuracies": 0.625, "rewards/chosen": -0.1276673972606659, "rewards/margins": 0.1114712506532669, "rewards/rejected": -0.2391386479139328, "step": 640 }, { "epoch": 0.04, "grad_norm": 15.375, "learning_rate": 2.1255722694571616e-06, "logits/chosen": -2.9170284271240234, "logits/rejected": -2.8363306522369385, "logps/chosen": -301.1606140136719, "logps/rejected": -331.51568603515625, "loss": 0.6375, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.1267838180065155, "rewards/margins": 0.14333856105804443, "rewards/rejected": -0.27012234926223755, "step": 650 }, { "epoch": 0.04, "grad_norm": 5.03125, "learning_rate": 2.158273381294964e-06, "logits/chosen": -2.9045729637145996, "logits/rejected": -2.8085038661956787, "logps/chosen": -337.9372253417969, "logps/rejected": -291.80389404296875, "loss": 0.6119, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.09674396365880966, "rewards/margins": 0.20411252975463867, "rewards/rejected": -0.30085647106170654, "step": 660 }, { "epoch": 0.04, "grad_norm": 8.125, "learning_rate": 2.190974493132767e-06, "logits/chosen": -2.8718059062957764, "logits/rejected": -2.7343688011169434, "logps/chosen": -290.7283935546875, "logps/rejected": -260.84552001953125, "loss": 0.6634, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.2317420244216919, "rewards/margins": 0.10202614217996597, "rewards/rejected": -0.33376818895339966, "step": 670 }, { "epoch": 0.04, "grad_norm": 6.53125, "learning_rate": 2.223675604970569e-06, "logits/chosen": -2.7919726371765137, "logits/rejected": -2.766650676727295, "logps/chosen": -238.778076171875, "logps/rejected": -240.41845703125, "loss": 0.624, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.20796601474285126, "rewards/margins": 0.17475023865699768, "rewards/rejected": -0.38271623849868774, "step": 680 }, { "epoch": 0.05, "grad_norm": 16.125, "learning_rate": 2.2563767168083718e-06, "logits/chosen": -2.959176540374756, "logits/rejected": -2.8867645263671875, "logps/chosen": -313.923828125, "logps/rejected": -267.5486755371094, "loss": 0.6521, "rewards/accuracies": 0.625, "rewards/chosen": -0.28058522939682007, "rewards/margins": 0.1260571926832199, "rewards/rejected": -0.40664243698120117, "step": 690 }, { "epoch": 0.05, "grad_norm": 4.4375, "learning_rate": 2.289077828646174e-06, "logits/chosen": -2.8358845710754395, "logits/rejected": -2.8310768604278564, "logps/chosen": -270.5147399902344, "logps/rejected": -293.17327880859375, "loss": 0.6614, "rewards/accuracies": 0.625, "rewards/chosen": -0.4039418697357178, "rewards/margins": 0.0859096497297287, "rewards/rejected": -0.48985153436660767, "step": 700 }, { "epoch": 0.05, "eval_logits/chosen": -2.8265833854675293, "eval_logits/rejected": -2.800459384918213, "eval_logps/chosen": -309.73919677734375, "eval_logps/rejected": -299.82574462890625, "eval_loss": 0.6530683636665344, "eval_rewards/accuracies": 0.656000018119812, "eval_rewards/chosen": -0.4511961340904236, "eval_rewards/margins": 0.10134478658437729, "eval_rewards/rejected": -0.5525408983230591, "eval_runtime": 1081.7611, "eval_samples_per_second": 1.849, "eval_steps_per_second": 1.849, "step": 700 }, { "epoch": 0.05, "grad_norm": 10.5, "learning_rate": 2.3217789404839766e-06, "logits/chosen": -2.662567615509033, "logits/rejected": -2.6417489051818848, "logps/chosen": -313.07159423828125, "logps/rejected": -304.4773864746094, "loss": 0.6267, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.3954770565032959, "rewards/margins": 0.16232043504714966, "rewards/rejected": -0.5577974915504456, "step": 710 }, { "epoch": 0.05, "grad_norm": 6.09375, "learning_rate": 2.354480052321779e-06, "logits/chosen": -2.975496768951416, "logits/rejected": -2.8815956115722656, "logps/chosen": -303.1222229003906, "logps/rejected": -314.8261413574219, "loss": 0.6441, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.35462966561317444, "rewards/margins": 0.13860280811786652, "rewards/rejected": -0.4932324290275574, "step": 720 }, { "epoch": 0.05, "grad_norm": 8.9375, "learning_rate": 2.3871811641595815e-06, "logits/chosen": -2.8100228309631348, "logits/rejected": -2.8001859188079834, "logps/chosen": -262.35009765625, "logps/rejected": -257.75103759765625, "loss": 0.6684, "rewards/accuracies": 0.625, "rewards/chosen": -0.299233078956604, "rewards/margins": 0.08083197474479675, "rewards/rejected": -0.38006505370140076, "step": 730 }, { "epoch": 0.05, "grad_norm": 6.96875, "learning_rate": 2.4198822759973843e-06, "logits/chosen": -2.907302141189575, "logits/rejected": -2.857713222503662, "logps/chosen": -245.0203857421875, "logps/rejected": -235.4355010986328, "loss": 0.5818, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09289664030075073, "rewards/margins": 0.2638043463230133, "rewards/rejected": -0.35670095682144165, "step": 740 }, { "epoch": 0.05, "grad_norm": 4.53125, "learning_rate": 2.4525833878351864e-06, "logits/chosen": -2.876056432723999, "logits/rejected": -2.859523296356201, "logps/chosen": -298.14312744140625, "logps/rejected": -321.460693359375, "loss": 0.6486, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.19290563464164734, "rewards/margins": 0.12680259346961975, "rewards/rejected": -0.3197082281112671, "step": 750 }, { "epoch": 0.05, "grad_norm": 4.9375, "learning_rate": 2.4852844996729892e-06, "logits/chosen": -2.8413796424865723, "logits/rejected": -2.7181670665740967, "logps/chosen": -285.8565673828125, "logps/rejected": -277.8565673828125, "loss": 0.6385, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.14745251834392548, "rewards/margins": 0.14486998319625854, "rewards/rejected": -0.2923224866390228, "step": 760 }, { "epoch": 0.05, "grad_norm": 4.375, "learning_rate": 2.5179856115107916e-06, "logits/chosen": -2.6753904819488525, "logits/rejected": -2.6096136569976807, "logps/chosen": -281.56512451171875, "logps/rejected": -254.1962127685547, "loss": 0.6243, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.07642120122909546, "rewards/margins": 0.1657271385192871, "rewards/rejected": -0.24214835464954376, "step": 770 }, { "epoch": 0.05, "grad_norm": 6.28125, "learning_rate": 2.5506867233485937e-06, "logits/chosen": -2.879664182662964, "logits/rejected": -2.861976146697998, "logps/chosen": -279.1064147949219, "logps/rejected": -293.08355712890625, "loss": 0.6799, "rewards/accuracies": 0.625, "rewards/chosen": -0.264035165309906, "rewards/margins": 0.08346347510814667, "rewards/rejected": -0.34749865531921387, "step": 780 }, { "epoch": 0.05, "grad_norm": 8.25, "learning_rate": 2.5833878351863965e-06, "logits/chosen": -2.8534836769104004, "logits/rejected": -2.8078601360321045, "logps/chosen": -223.9133758544922, "logps/rejected": -259.12481689453125, "loss": 0.6151, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.23626156151294708, "rewards/margins": 0.19797801971435547, "rewards/rejected": -0.43423956632614136, "step": 790 }, { "epoch": 0.05, "grad_norm": 7.4375, "learning_rate": 2.616088947024199e-06, "logits/chosen": -2.7915101051330566, "logits/rejected": -2.7566821575164795, "logps/chosen": -281.08392333984375, "logps/rejected": -280.63555908203125, "loss": 0.618, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.472278892993927, "rewards/margins": 0.20760074257850647, "rewards/rejected": -0.6798796653747559, "step": 800 }, { "epoch": 0.05, "eval_logits/chosen": -2.7835373878479004, "eval_logits/rejected": -2.767955780029297, "eval_logps/chosen": -323.9272766113281, "eval_logps/rejected": -324.0606689453125, "eval_loss": 0.628663182258606, "eval_rewards/accuracies": 0.6570000052452087, "eval_rewards/chosen": -0.5930769443511963, "eval_rewards/margins": 0.2018129676580429, "eval_rewards/rejected": -0.7948898673057556, "eval_runtime": 1081.7561, "eval_samples_per_second": 1.849, "eval_steps_per_second": 1.849, "step": 800 }, { "epoch": 0.05, "grad_norm": 18.75, "learning_rate": 2.6487900588620014e-06, "logits/chosen": -2.7593894004821777, "logits/rejected": -2.770810842514038, "logps/chosen": -279.38677978515625, "logps/rejected": -283.33343505859375, "loss": 0.6535, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.6230661869049072, "rewards/margins": 0.1362382471561432, "rewards/rejected": -0.759304404258728, "step": 810 }, { "epoch": 0.05, "grad_norm": 21.0, "learning_rate": 2.6814911706998042e-06, "logits/chosen": -2.7998344898223877, "logits/rejected": -2.7249693870544434, "logps/chosen": -313.9578552246094, "logps/rejected": -290.28497314453125, "loss": 0.6165, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.6581670045852661, "rewards/margins": 0.21339598298072815, "rewards/rejected": -0.8715629577636719, "step": 820 }, { "epoch": 0.05, "grad_norm": 13.875, "learning_rate": 2.7141922825376067e-06, "logits/chosen": -2.735663890838623, "logits/rejected": -2.684382200241089, "logps/chosen": -335.53485107421875, "logps/rejected": -311.4736022949219, "loss": 0.5913, "rewards/accuracies": 0.75, "rewards/chosen": -0.616738498210907, "rewards/margins": 0.2691377103328705, "rewards/rejected": -0.8858762979507446, "step": 830 }, { "epoch": 0.05, "grad_norm": 8.0, "learning_rate": 2.746893394375409e-06, "logits/chosen": -2.9455153942108154, "logits/rejected": -2.8234474658966064, "logps/chosen": -375.05645751953125, "logps/rejected": -295.7832946777344, "loss": 0.681, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.578392744064331, "rewards/margins": 0.106623575091362, "rewards/rejected": -0.6850163340568542, "step": 840 }, { "epoch": 0.06, "grad_norm": 7.28125, "learning_rate": 2.779594506213211e-06, "logits/chosen": -2.907238721847534, "logits/rejected": -2.9067509174346924, "logps/chosen": -355.4493713378906, "logps/rejected": -316.18212890625, "loss": 0.6092, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.3272998631000519, "rewards/margins": 0.25181305408477783, "rewards/rejected": -0.5791130065917969, "step": 850 }, { "epoch": 0.06, "grad_norm": 9.5, "learning_rate": 2.812295618051014e-06, "logits/chosen": -2.8676223754882812, "logits/rejected": -2.8018381595611572, "logps/chosen": -322.13653564453125, "logps/rejected": -277.14459228515625, "loss": 0.5548, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.35106998682022095, "rewards/margins": 0.37932613492012024, "rewards/rejected": -0.7303961515426636, "step": 860 }, { "epoch": 0.06, "grad_norm": 7.15625, "learning_rate": 2.8449967298888164e-06, "logits/chosen": -2.9881463050842285, "logits/rejected": -2.9139270782470703, "logps/chosen": -282.9815368652344, "logps/rejected": -255.7547149658203, "loss": 0.6657, "rewards/accuracies": 0.5, "rewards/chosen": -0.4062068462371826, "rewards/margins": 0.13923940062522888, "rewards/rejected": -0.5454462766647339, "step": 870 }, { "epoch": 0.06, "grad_norm": 5.5625, "learning_rate": 2.877697841726619e-06, "logits/chosen": -2.8291215896606445, "logits/rejected": -2.934727668762207, "logps/chosen": -261.79522705078125, "logps/rejected": -320.746826171875, "loss": 0.6654, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.48628419637680054, "rewards/margins": 0.17673709988594055, "rewards/rejected": -0.6630212664604187, "step": 880 }, { "epoch": 0.06, "grad_norm": 9.25, "learning_rate": 2.9103989535644217e-06, "logits/chosen": -2.828789234161377, "logits/rejected": -2.748217821121216, "logps/chosen": -322.1888427734375, "logps/rejected": -307.00909423828125, "loss": 0.6046, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.4386056065559387, "rewards/margins": 0.2819632887840271, "rewards/rejected": -0.7205688953399658, "step": 890 }, { "epoch": 0.06, "grad_norm": 13.375, "learning_rate": 2.943100065402224e-06, "logits/chosen": -2.836808443069458, "logits/rejected": -2.7509636878967285, "logps/chosen": -296.68743896484375, "logps/rejected": -282.49908447265625, "loss": 0.6067, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.39099884033203125, "rewards/margins": 0.2603073716163635, "rewards/rejected": -0.6513062715530396, "step": 900 }, { "epoch": 0.06, "eval_logits/chosen": -2.7821381092071533, "eval_logits/rejected": -2.774449110031128, "eval_logps/chosen": -304.8562927246094, "eval_logps/rejected": -308.3404235839844, "eval_loss": 0.618189811706543, "eval_rewards/accuracies": 0.6735000014305115, "eval_rewards/chosen": -0.4023667573928833, "eval_rewards/margins": 0.2353210747241974, "eval_rewards/rejected": -0.6376878023147583, "eval_runtime": 1081.7592, "eval_samples_per_second": 1.849, "eval_steps_per_second": 1.849, "step": 900 }, { "epoch": 0.06, "grad_norm": 14.9375, "learning_rate": 2.9758011772400266e-06, "logits/chosen": -2.508796215057373, "logits/rejected": -2.489849805831909, "logps/chosen": -300.6234436035156, "logps/rejected": -316.33380126953125, "loss": 0.6146, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.3689255118370056, "rewards/margins": 0.2514709532260895, "rewards/rejected": -0.6203964948654175, "step": 910 }, { "epoch": 0.06, "grad_norm": 15.1875, "learning_rate": 3.0085022890778286e-06, "logits/chosen": -2.878131866455078, "logits/rejected": -2.8262393474578857, "logps/chosen": -359.237060546875, "logps/rejected": -331.756103515625, "loss": 0.6005, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.5989158749580383, "rewards/margins": 0.28526726365089417, "rewards/rejected": -0.8841831088066101, "step": 920 }, { "epoch": 0.06, "grad_norm": 7.5, "learning_rate": 3.0412034009156314e-06, "logits/chosen": -2.873969316482544, "logits/rejected": -2.755413770675659, "logps/chosen": -324.1197204589844, "logps/rejected": -372.75518798828125, "loss": 0.6172, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5690174102783203, "rewards/margins": 0.26125672459602356, "rewards/rejected": -0.8302741050720215, "step": 930 }, { "epoch": 0.06, "grad_norm": 10.3125, "learning_rate": 3.073904512753434e-06, "logits/chosen": -2.7132937908172607, "logits/rejected": -2.7459683418273926, "logps/chosen": -282.2320251464844, "logps/rejected": -292.9881286621094, "loss": 0.6484, "rewards/accuracies": 0.625, "rewards/chosen": -0.4607372283935547, "rewards/margins": 0.18903078138828278, "rewards/rejected": -0.6497679948806763, "step": 940 }, { "epoch": 0.06, "grad_norm": 7.28125, "learning_rate": 3.1066056245912363e-06, "logits/chosen": -2.82425594329834, "logits/rejected": -2.770087718963623, "logps/chosen": -281.05224609375, "logps/rejected": -287.1793518066406, "loss": 0.6744, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.4604698121547699, "rewards/margins": 0.12202539294958115, "rewards/rejected": -0.5824951529502869, "step": 950 }, { "epoch": 0.06, "grad_norm": 11.5625, "learning_rate": 3.1393067364290387e-06, "logits/chosen": -2.671586513519287, "logits/rejected": -2.724806070327759, "logps/chosen": -276.7925109863281, "logps/rejected": -324.72723388671875, "loss": 0.6224, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.5155248641967773, "rewards/margins": 0.27831584215164185, "rewards/rejected": -0.7938407063484192, "step": 960 }, { "epoch": 0.06, "grad_norm": 9.625, "learning_rate": 3.1720078482668416e-06, "logits/chosen": -2.8547513484954834, "logits/rejected": -2.7828500270843506, "logps/chosen": -331.3255615234375, "logps/rejected": -296.7285461425781, "loss": 0.6188, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.7216477394104004, "rewards/margins": 0.26360782980918884, "rewards/rejected": -0.9852555394172668, "step": 970 }, { "epoch": 0.06, "grad_norm": 11.0, "learning_rate": 3.204708960104644e-06, "logits/chosen": -2.6997883319854736, "logits/rejected": -2.6977345943450928, "logps/chosen": -274.75787353515625, "logps/rejected": -266.43902587890625, "loss": 0.6373, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.7571477890014648, "rewards/margins": 0.21312901377677917, "rewards/rejected": -0.9702768325805664, "step": 980 }, { "epoch": 0.06, "grad_norm": 8.125, "learning_rate": 3.237410071942446e-06, "logits/chosen": -2.8839809894561768, "logits/rejected": -2.839451789855957, "logps/chosen": -311.96038818359375, "logps/rejected": -329.8813781738281, "loss": 0.61, "rewards/accuracies": 0.75, "rewards/chosen": -0.7968910336494446, "rewards/margins": 0.22231867909431458, "rewards/rejected": -1.0192097425460815, "step": 990 }, { "epoch": 0.07, "grad_norm": 8.25, "learning_rate": 3.270111183780249e-06, "logits/chosen": -2.730861186981201, "logits/rejected": -2.7505993843078613, "logps/chosen": -377.7177734375, "logps/rejected": -356.99920654296875, "loss": 0.6175, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.9969084858894348, "rewards/margins": 0.25567707419395447, "rewards/rejected": -1.252585530281067, "step": 1000 }, { "epoch": 0.07, "eval_logits/chosen": -2.7655293941497803, "eval_logits/rejected": -2.7531065940856934, "eval_logps/chosen": -364.26715087890625, "eval_logps/rejected": -365.1882019042969, "eval_loss": 0.6295114159584045, "eval_rewards/accuracies": 0.6510000228881836, "eval_rewards/chosen": -0.9964756965637207, "eval_rewards/margins": 0.2096894234418869, "eval_rewards/rejected": -1.206165075302124, "eval_runtime": 1082.3623, "eval_samples_per_second": 1.848, "eval_steps_per_second": 1.848, "step": 1000 }, { "epoch": 0.07, "grad_norm": 7.3125, "learning_rate": 3.3028122956180513e-06, "logits/chosen": -2.536822557449341, "logits/rejected": -2.6047489643096924, "logps/chosen": -361.06109619140625, "logps/rejected": -418.49127197265625, "loss": 0.6261, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.0592323541641235, "rewards/margins": 0.20766028761863708, "rewards/rejected": -1.266892671585083, "step": 1010 }, { "epoch": 0.07, "grad_norm": 19.25, "learning_rate": 3.3355134074558538e-06, "logits/chosen": -2.8570094108581543, "logits/rejected": -2.710625648498535, "logps/chosen": -419.71673583984375, "logps/rejected": -345.802978515625, "loss": 0.6031, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9804995656013489, "rewards/margins": 0.2804907262325287, "rewards/rejected": -1.2609902620315552, "step": 1020 }, { "epoch": 0.07, "grad_norm": 11.8125, "learning_rate": 3.368214519293656e-06, "logits/chosen": -2.7744476795196533, "logits/rejected": -2.6952767372131348, "logps/chosen": -360.94818115234375, "logps/rejected": -366.3872375488281, "loss": 0.6477, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.9846881031990051, "rewards/margins": 0.16120806336402893, "rewards/rejected": -1.1458961963653564, "step": 1030 }, { "epoch": 0.07, "grad_norm": 8.625, "learning_rate": 3.400915631131459e-06, "logits/chosen": -2.730869770050049, "logits/rejected": -2.8384108543395996, "logps/chosen": -373.50274658203125, "logps/rejected": -334.2362060546875, "loss": 0.6174, "rewards/accuracies": 0.625, "rewards/chosen": -0.8966708183288574, "rewards/margins": 0.24548539519309998, "rewards/rejected": -1.1421562433242798, "step": 1040 }, { "epoch": 0.07, "grad_norm": 10.5, "learning_rate": 3.4336167429692615e-06, "logits/chosen": -2.75400447845459, "logits/rejected": -2.7195305824279785, "logps/chosen": -327.2931213378906, "logps/rejected": -348.9956970214844, "loss": 0.5703, "rewards/accuracies": 0.75, "rewards/chosen": -0.9459779858589172, "rewards/margins": 0.3319534957408905, "rewards/rejected": -1.2779314517974854, "step": 1050 }, { "epoch": 0.07, "grad_norm": 11.375, "learning_rate": 3.4663178548070635e-06, "logits/chosen": -2.710994005203247, "logits/rejected": -2.6512699127197266, "logps/chosen": -381.109130859375, "logps/rejected": -378.1839599609375, "loss": 0.5194, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.0074336528778076, "rewards/margins": 0.5151085257530212, "rewards/rejected": -1.5225422382354736, "step": 1060 }, { "epoch": 0.07, "grad_norm": 10.625, "learning_rate": 3.499018966644866e-06, "logits/chosen": -2.612952709197998, "logits/rejected": -2.6333603858947754, "logps/chosen": -377.6126403808594, "logps/rejected": -489.4436950683594, "loss": 0.5137, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1240646839141846, "rewards/margins": 0.7087768912315369, "rewards/rejected": -1.8328415155410767, "step": 1070 }, { "epoch": 0.07, "grad_norm": 11.1875, "learning_rate": 3.531720078482669e-06, "logits/chosen": -2.6375420093536377, "logits/rejected": -2.5807301998138428, "logps/chosen": -425.98468017578125, "logps/rejected": -413.052734375, "loss": 0.5579, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.2555062770843506, "rewards/margins": 0.5064564943313599, "rewards/rejected": -1.761962890625, "step": 1080 }, { "epoch": 0.07, "grad_norm": 16.75, "learning_rate": 3.5644211903204712e-06, "logits/chosen": -2.72477650642395, "logits/rejected": -2.566455602645874, "logps/chosen": -453.204833984375, "logps/rejected": -371.91851806640625, "loss": 0.5685, "rewards/accuracies": 0.75, "rewards/chosen": -0.9531938433647156, "rewards/margins": 0.5403688549995422, "rewards/rejected": -1.4935626983642578, "step": 1090 }, { "epoch": 0.07, "grad_norm": 10.0625, "learning_rate": 3.5971223021582737e-06, "logits/chosen": -2.6327943801879883, "logits/rejected": -2.6187589168548584, "logps/chosen": -328.882080078125, "logps/rejected": -353.8262634277344, "loss": 0.7016, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.7370535731315613, "rewards/margins": 0.18583402037620544, "rewards/rejected": -0.9228876233100891, "step": 1100 }, { "epoch": 0.07, "eval_logits/chosen": -2.6843974590301514, "eval_logits/rejected": -2.6565659046173096, "eval_logps/chosen": -320.6014709472656, "eval_logps/rejected": -337.1476135253906, "eval_loss": 0.5882494449615479, "eval_rewards/accuracies": 0.6855000257492065, "eval_rewards/chosen": -0.5598190426826477, "eval_rewards/margins": 0.3659406304359436, "eval_rewards/rejected": -0.9257596731185913, "eval_runtime": 1081.6473, "eval_samples_per_second": 1.849, "eval_steps_per_second": 1.849, "step": 1100 }, { "epoch": 0.07, "grad_norm": 11.9375, "learning_rate": 3.6298234139960765e-06, "logits/chosen": -2.6155037879943848, "logits/rejected": -2.6500773429870605, "logps/chosen": -306.08441162109375, "logps/rejected": -364.59991455078125, "loss": 0.5635, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5728699564933777, "rewards/margins": 0.4202982783317566, "rewards/rejected": -0.9931682348251343, "step": 1110 }, { "epoch": 0.07, "grad_norm": 6.84375, "learning_rate": 3.6625245258338785e-06, "logits/chosen": -2.650627851486206, "logits/rejected": -2.719773769378662, "logps/chosen": -267.1049499511719, "logps/rejected": -309.8330993652344, "loss": 0.5773, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.5874170660972595, "rewards/margins": 0.3674176335334778, "rewards/rejected": -0.9548347592353821, "step": 1120 }, { "epoch": 0.07, "grad_norm": 6.875, "learning_rate": 3.695225637671681e-06, "logits/chosen": -2.692509174346924, "logits/rejected": -2.698432683944702, "logps/chosen": -327.5220642089844, "logps/rejected": -346.409912109375, "loss": 0.5551, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.43535739183425903, "rewards/margins": 0.4689648747444153, "rewards/rejected": -0.9043222665786743, "step": 1130 }, { "epoch": 0.07, "grad_norm": 10.1875, "learning_rate": 3.7279267495094834e-06, "logits/chosen": -2.5295534133911133, "logits/rejected": -2.627462863922119, "logps/chosen": -287.23797607421875, "logps/rejected": -371.944580078125, "loss": 0.5924, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.5558944940567017, "rewards/margins": 0.39327239990234375, "rewards/rejected": -0.9491668939590454, "step": 1140 }, { "epoch": 0.08, "grad_norm": 12.0625, "learning_rate": 3.7606278613472863e-06, "logits/chosen": -2.3831377029418945, "logits/rejected": -2.3231358528137207, "logps/chosen": -392.8887939453125, "logps/rejected": -367.8238525390625, "loss": 0.5845, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.7180911302566528, "rewards/margins": 0.40381890535354614, "rewards/rejected": -1.1219100952148438, "step": 1150 }, { "epoch": 0.08, "grad_norm": 15.375, "learning_rate": 3.7933289731850887e-06, "logits/chosen": -2.424847364425659, "logits/rejected": -2.417250633239746, "logps/chosen": -350.7486267089844, "logps/rejected": -389.21881103515625, "loss": 0.5638, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.7766641974449158, "rewards/margins": 0.48292917013168335, "rewards/rejected": -1.2595933675765991, "step": 1160 }, { "epoch": 0.08, "grad_norm": 21.125, "learning_rate": 3.826030085022891e-06, "logits/chosen": -2.4809021949768066, "logits/rejected": -2.4700427055358887, "logps/chosen": -352.4534606933594, "logps/rejected": -382.7125549316406, "loss": 0.5396, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.8692823648452759, "rewards/margins": 0.5558506846427917, "rewards/rejected": -1.4251329898834229, "step": 1170 }, { "epoch": 0.08, "grad_norm": 28.5, "learning_rate": 3.858731196860693e-06, "logits/chosen": -2.5811359882354736, "logits/rejected": -2.5154001712799072, "logps/chosen": -393.1356201171875, "logps/rejected": -401.1488342285156, "loss": 0.6951, "rewards/accuracies": 0.625, "rewards/chosen": -0.9323288798332214, "rewards/margins": 0.29660099744796753, "rewards/rejected": -1.2289297580718994, "step": 1180 }, { "epoch": 0.08, "grad_norm": 10.4375, "learning_rate": 3.891432308698496e-06, "logits/chosen": -2.6071553230285645, "logits/rejected": -2.560971736907959, "logps/chosen": -278.6733093261719, "logps/rejected": -315.0905456542969, "loss": 0.6165, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.7238808870315552, "rewards/margins": 0.3133261799812317, "rewards/rejected": -1.037207007408142, "step": 1190 }, { "epoch": 0.08, "grad_norm": 7.9375, "learning_rate": 3.924133420536299e-06, "logits/chosen": -2.6004879474639893, "logits/rejected": -2.622706413269043, "logps/chosen": -357.74688720703125, "logps/rejected": -443.885009765625, "loss": 0.6085, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.7794581651687622, "rewards/margins": 0.4840115010738373, "rewards/rejected": -1.2634696960449219, "step": 1200 }, { "epoch": 0.08, "eval_logits/chosen": -2.5651071071624756, "eval_logits/rejected": -2.5379154682159424, "eval_logps/chosen": -356.6389465332031, "eval_logps/rejected": -383.92120361328125, "eval_loss": 0.5892837047576904, "eval_rewards/accuracies": 0.684499979019165, "eval_rewards/chosen": -0.9201933145523071, "eval_rewards/margins": 0.4733026623725891, "eval_rewards/rejected": -1.3934959173202515, "eval_runtime": 1081.272, "eval_samples_per_second": 1.85, "eval_steps_per_second": 1.85, "step": 1200 }, { "epoch": 0.08, "grad_norm": 12.125, "learning_rate": 3.956834532374101e-06, "logits/chosen": -2.5829615592956543, "logits/rejected": -2.5544989109039307, "logps/chosen": -367.85943603515625, "logps/rejected": -372.7584228515625, "loss": 0.6077, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9208486676216125, "rewards/margins": 0.41644948720932007, "rewards/rejected": -1.3372982740402222, "step": 1210 }, { "epoch": 0.08, "grad_norm": 9.75, "learning_rate": 3.989535644211904e-06, "logits/chosen": -2.6294922828674316, "logits/rejected": -2.5150341987609863, "logps/chosen": -346.77911376953125, "logps/rejected": -327.0277099609375, "loss": 0.5861, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.036087989807129, "rewards/margins": 0.4412565231323242, "rewards/rejected": -1.4773445129394531, "step": 1220 }, { "epoch": 0.08, "grad_norm": 7.90625, "learning_rate": 4.022236756049706e-06, "logits/chosen": -2.432917833328247, "logits/rejected": -2.444042205810547, "logps/chosen": -434.344482421875, "logps/rejected": -448.4246520996094, "loss": 0.5686, "rewards/accuracies": 0.75, "rewards/chosen": -1.41476571559906, "rewards/margins": 0.4937022626399994, "rewards/rejected": -1.9084678888320923, "step": 1230 }, { "epoch": 0.08, "grad_norm": 14.875, "learning_rate": 4.054937867887509e-06, "logits/chosen": -2.482348680496216, "logits/rejected": -2.4706835746765137, "logps/chosen": -431.07672119140625, "logps/rejected": -536.6197509765625, "loss": 0.4504, "rewards/accuracies": 0.75, "rewards/chosen": -1.791180968284607, "rewards/margins": 0.8711856603622437, "rewards/rejected": -2.6623663902282715, "step": 1240 }, { "epoch": 0.08, "grad_norm": 5.96875, "learning_rate": 4.087638979725311e-06, "logits/chosen": -2.411417007446289, "logits/rejected": -2.405646562576294, "logps/chosen": -448.13067626953125, "logps/rejected": -463.6844177246094, "loss": 0.6603, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.9602562189102173, "rewards/margins": 0.40531787276268005, "rewards/rejected": -2.3655741214752197, "step": 1250 }, { "epoch": 0.08, "grad_norm": 10.75, "learning_rate": 4.1203400915631135e-06, "logits/chosen": -2.5700125694274902, "logits/rejected": -2.44477915763855, "logps/chosen": -513.4207763671875, "logps/rejected": -484.97882080078125, "loss": 0.7518, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.048246383666992, "rewards/margins": 0.20837295055389404, "rewards/rejected": -2.2566192150115967, "step": 1260 }, { "epoch": 0.08, "grad_norm": 9.3125, "learning_rate": 4.153041203400916e-06, "logits/chosen": -2.57838773727417, "logits/rejected": -2.4079177379608154, "logps/chosen": -384.0234680175781, "logps/rejected": -342.88641357421875, "loss": 0.6869, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.4267017841339111, "rewards/margins": 0.24452288448810577, "rewards/rejected": -1.671224594116211, "step": 1270 }, { "epoch": 0.08, "grad_norm": 10.4375, "learning_rate": 4.185742315238718e-06, "logits/chosen": -2.6056461334228516, "logits/rejected": -2.6117546558380127, "logps/chosen": -309.5303649902344, "logps/rejected": -338.7605895996094, "loss": 0.5554, "rewards/accuracies": 0.75, "rewards/chosen": -0.7388064861297607, "rewards/margins": 0.4028482437133789, "rewards/rejected": -1.1416547298431396, "step": 1280 }, { "epoch": 0.08, "grad_norm": 14.4375, "learning_rate": 4.218443427076521e-06, "logits/chosen": -2.642995834350586, "logits/rejected": -2.508042335510254, "logps/chosen": -411.256103515625, "logps/rejected": -341.0725402832031, "loss": 0.5329, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.8479703664779663, "rewards/margins": 0.4832783341407776, "rewards/rejected": -1.3312487602233887, "step": 1290 }, { "epoch": 0.09, "grad_norm": 9.6875, "learning_rate": 4.251144538914323e-06, "logits/chosen": -2.507439374923706, "logits/rejected": -2.515289306640625, "logps/chosen": -323.7415771484375, "logps/rejected": -353.20977783203125, "loss": 0.6945, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.925136387348175, "rewards/margins": 0.18495909869670868, "rewards/rejected": -1.1100956201553345, "step": 1300 }, { "epoch": 0.09, "eval_logits/chosen": -2.565683603286743, "eval_logits/rejected": -2.5286052227020264, "eval_logps/chosen": -342.0801696777344, "eval_logps/rejected": -366.70947265625, "eval_loss": 0.5812696814537048, "eval_rewards/accuracies": 0.6855000257492065, "eval_rewards/chosen": -0.7746054530143738, "eval_rewards/margins": 0.44677236676216125, "eval_rewards/rejected": -1.2213780879974365, "eval_runtime": 1082.3627, "eval_samples_per_second": 1.848, "eval_steps_per_second": 1.848, "step": 1300 }, { "epoch": 0.09, "grad_norm": 13.0625, "learning_rate": 4.283845650752126e-06, "logits/chosen": -2.568589687347412, "logits/rejected": -2.501283884048462, "logps/chosen": -359.6022033691406, "logps/rejected": -410.34661865234375, "loss": 0.5426, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.892835795879364, "rewards/margins": 0.5845292806625366, "rewards/rejected": -1.4773650169372559, "step": 1310 }, { "epoch": 0.09, "grad_norm": 5.03125, "learning_rate": 4.316546762589928e-06, "logits/chosen": -2.4889063835144043, "logits/rejected": -2.451091766357422, "logps/chosen": -339.49261474609375, "logps/rejected": -398.74066162109375, "loss": 0.4442, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.7733551263809204, "rewards/margins": 0.7722034454345703, "rewards/rejected": -1.5455586910247803, "step": 1320 }, { "epoch": 0.09, "grad_norm": 9.625, "learning_rate": 4.349247874427731e-06, "logits/chosen": -2.2700552940368652, "logits/rejected": -2.307880163192749, "logps/chosen": -390.5911560058594, "logps/rejected": -418.5215759277344, "loss": 0.612, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.2193593978881836, "rewards/margins": 0.4874635338783264, "rewards/rejected": -1.7068229913711548, "step": 1330 }, { "epoch": 0.09, "grad_norm": 20.25, "learning_rate": 4.381948986265534e-06, "logits/chosen": -2.28364634513855, "logits/rejected": -2.394418478012085, "logps/chosen": -381.5599670410156, "logps/rejected": -459.710693359375, "loss": 0.5927, "rewards/accuracies": 0.75, "rewards/chosen": -1.9651191234588623, "rewards/margins": 0.36538931727409363, "rewards/rejected": -2.330508232116699, "step": 1340 }, { "epoch": 0.09, "grad_norm": 7.0, "learning_rate": 4.414650098103336e-06, "logits/chosen": -2.3645377159118652, "logits/rejected": -2.396665096282959, "logps/chosen": -475.80572509765625, "logps/rejected": -518.6478881835938, "loss": 0.5815, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.8102824687957764, "rewards/margins": 0.5250512361526489, "rewards/rejected": -2.3353335857391357, "step": 1350 }, { "epoch": 0.09, "grad_norm": 6.3125, "learning_rate": 4.447351209941138e-06, "logits/chosen": -2.285235643386841, "logits/rejected": -2.2442283630371094, "logps/chosen": -471.9356994628906, "logps/rejected": -482.18817138671875, "loss": 0.5389, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.9217197895050049, "rewards/margins": 0.6087549328804016, "rewards/rejected": -2.530474901199341, "step": 1360 }, { "epoch": 0.09, "grad_norm": 10.8125, "learning_rate": 4.480052321778941e-06, "logits/chosen": -2.3508994579315186, "logits/rejected": -2.343384265899658, "logps/chosen": -429.74639892578125, "logps/rejected": -492.23944091796875, "loss": 0.5687, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6354624032974243, "rewards/margins": 0.6040918827056885, "rewards/rejected": -2.2395541667938232, "step": 1370 }, { "epoch": 0.09, "grad_norm": 10.6875, "learning_rate": 4.5127534336167435e-06, "logits/chosen": -2.2081732749938965, "logits/rejected": -2.284060478210449, "logps/chosen": -354.6275329589844, "logps/rejected": -442.61376953125, "loss": 0.5947, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.4146260023117065, "rewards/margins": 0.4029269218444824, "rewards/rejected": -1.8175528049468994, "step": 1380 }, { "epoch": 0.09, "grad_norm": 7.8125, "learning_rate": 4.5454545454545455e-06, "logits/chosen": -2.1286473274230957, "logits/rejected": -1.9813802242279053, "logps/chosen": -408.2415771484375, "logps/rejected": -418.65838623046875, "loss": 0.5544, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.1932270526885986, "rewards/margins": 0.5883380770683289, "rewards/rejected": -1.7815649509429932, "step": 1390 }, { "epoch": 0.09, "grad_norm": 12.4375, "learning_rate": 4.578155657292348e-06, "logits/chosen": -2.1994805335998535, "logits/rejected": -2.2329888343811035, "logps/chosen": -396.0263366699219, "logps/rejected": -445.6288146972656, "loss": 0.5341, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.288704514503479, "rewards/margins": 0.6813798546791077, "rewards/rejected": -1.9700844287872314, "step": 1400 }, { "epoch": 0.09, "eval_logits/chosen": -2.2154128551483154, "eval_logits/rejected": -2.161238670349121, "eval_logps/chosen": -415.0721740722656, "eval_logps/rejected": -445.2536315917969, "eval_loss": 0.6005379557609558, "eval_rewards/accuracies": 0.6700000166893005, "eval_rewards/chosen": -1.5045257806777954, "eval_rewards/margins": 0.5022938847541809, "eval_rewards/rejected": -2.006819725036621, "eval_runtime": 1081.8275, "eval_samples_per_second": 1.849, "eval_steps_per_second": 1.849, "step": 1400 }, { "epoch": 0.09, "grad_norm": 9.3125, "learning_rate": 4.610856769130151e-06, "logits/chosen": -2.1071534156799316, "logits/rejected": -2.0689315795898438, "logps/chosen": -402.35955810546875, "logps/rejected": -472.33673095703125, "loss": 0.456, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.336113691329956, "rewards/margins": 0.9264262914657593, "rewards/rejected": -2.2625396251678467, "step": 1410 }, { "epoch": 0.09, "grad_norm": 25.5, "learning_rate": 4.643557880967953e-06, "logits/chosen": -2.2486603260040283, "logits/rejected": -2.0841245651245117, "logps/chosen": -451.0216369628906, "logps/rejected": -514.2208251953125, "loss": 0.5279, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5336716175079346, "rewards/margins": 0.8495262265205383, "rewards/rejected": -2.383197784423828, "step": 1420 }, { "epoch": 0.09, "grad_norm": 33.25, "learning_rate": 4.676258992805755e-06, "logits/chosen": -2.1530425548553467, "logits/rejected": -1.9252078533172607, "logps/chosen": -468.1920471191406, "logps/rejected": -471.0855407714844, "loss": 0.593, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7321354150772095, "rewards/margins": 0.6901968121528625, "rewards/rejected": -2.422332286834717, "step": 1430 }, { "epoch": 0.09, "grad_norm": 10.5625, "learning_rate": 4.708960104643558e-06, "logits/chosen": -2.1906585693359375, "logits/rejected": -2.231067180633545, "logps/chosen": -406.8133544921875, "logps/rejected": -453.7689514160156, "loss": 0.6822, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.328041672706604, "rewards/margins": 0.42143169045448303, "rewards/rejected": -1.749473214149475, "step": 1440 }, { "epoch": 0.09, "grad_norm": 14.0625, "learning_rate": 4.741661216481361e-06, "logits/chosen": -1.9857046604156494, "logits/rejected": -1.9321680068969727, "logps/chosen": -384.687744140625, "logps/rejected": -410.2315368652344, "loss": 0.5566, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.8782261610031128, "rewards/margins": 0.540283203125, "rewards/rejected": -1.4185093641281128, "step": 1450 }, { "epoch": 0.1, "grad_norm": 19.875, "learning_rate": 4.774362328319163e-06, "logits/chosen": -2.388868808746338, "logits/rejected": -2.2179524898529053, "logps/chosen": -331.8688049316406, "logps/rejected": -368.47784423828125, "loss": 0.5054, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.46308690309524536, "rewards/margins": 0.5968219041824341, "rewards/rejected": -1.0599087476730347, "step": 1460 }, { "epoch": 0.1, "grad_norm": 37.75, "learning_rate": 4.807063440156966e-06, "logits/chosen": -2.178821086883545, "logits/rejected": -2.1437385082244873, "logps/chosen": -271.39105224609375, "logps/rejected": -327.5812683105469, "loss": 0.5609, "rewards/accuracies": 0.625, "rewards/chosen": -0.5633358955383301, "rewards/margins": 0.5271973609924316, "rewards/rejected": -1.0905331373214722, "step": 1470 }, { "epoch": 0.1, "grad_norm": 7.8125, "learning_rate": 4.839764551994769e-06, "logits/chosen": -2.09272837638855, "logits/rejected": -2.029414415359497, "logps/chosen": -316.9127197265625, "logps/rejected": -308.25146484375, "loss": 0.568, "rewards/accuracies": 0.75, "rewards/chosen": -0.4745386242866516, "rewards/margins": 0.44215574860572815, "rewards/rejected": -0.9166943430900574, "step": 1480 }, { "epoch": 0.1, "grad_norm": 10.375, "learning_rate": 4.872465663832571e-06, "logits/chosen": -2.024477481842041, "logits/rejected": -1.9662601947784424, "logps/chosen": -348.9256591796875, "logps/rejected": -365.7204895019531, "loss": 0.5418, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.8873903155326843, "rewards/margins": 0.6633241176605225, "rewards/rejected": -1.5507144927978516, "step": 1490 }, { "epoch": 0.1, "grad_norm": 34.5, "learning_rate": 4.905166775670373e-06, "logits/chosen": -2.0312588214874268, "logits/rejected": -2.0548675060272217, "logps/chosen": -315.7076110839844, "logps/rejected": -411.4384765625, "loss": 0.5724, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.1290388107299805, "rewards/margins": 0.6812268495559692, "rewards/rejected": -1.8102657794952393, "step": 1500 }, { "epoch": 0.1, "eval_logits/chosen": -1.9874122142791748, "eval_logits/rejected": -1.9123127460479736, "eval_logps/chosen": -388.1943359375, "eval_logps/rejected": -436.26513671875, "eval_loss": 0.5870843529701233, "eval_rewards/accuracies": 0.6890000104904175, "eval_rewards/chosen": -1.235747218132019, "eval_rewards/margins": 0.6811867356300354, "eval_rewards/rejected": -1.9169338941574097, "eval_runtime": 1082.704, "eval_samples_per_second": 1.847, "eval_steps_per_second": 1.847, "step": 1500 }, { "epoch": 0.1, "grad_norm": 17.5, "learning_rate": 4.9378678875081756e-06, "logits/chosen": -2.0847742557525635, "logits/rejected": -2.0252161026000977, "logps/chosen": -428.1272888183594, "logps/rejected": -451.4873046875, "loss": 0.6095, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.160365104675293, "rewards/margins": 0.6618114709854126, "rewards/rejected": -1.8221765756607056, "step": 1510 }, { "epoch": 0.1, "grad_norm": 10.6875, "learning_rate": 4.9705689993459784e-06, "logits/chosen": -2.0510177612304688, "logits/rejected": -1.9776346683502197, "logps/chosen": -294.0181884765625, "logps/rejected": -333.2395935058594, "loss": 0.6073, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.9080532789230347, "rewards/margins": 0.4232449531555176, "rewards/rejected": -1.3312981128692627, "step": 1520 }, { "epoch": 0.1, "grad_norm": 13.5625, "learning_rate": 4.999999934784367e-06, "logits/chosen": -2.0630900859832764, "logits/rejected": -1.929762601852417, "logps/chosen": -337.39300537109375, "logps/rejected": -304.6712341308594, "loss": 0.606, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8546848297119141, "rewards/margins": 0.3453328609466553, "rewards/rejected": -1.2000176906585693, "step": 1530 }, { "epoch": 0.1, "grad_norm": 66.5, "learning_rate": 4.999992108912479e-06, "logits/chosen": -2.0287814140319824, "logits/rejected": -2.0112826824188232, "logps/chosen": -339.3333740234375, "logps/rejected": -377.3528137207031, "loss": 0.6561, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.0236115455627441, "rewards/margins": 0.36071905493736267, "rewards/rejected": -1.3843306303024292, "step": 1540 }, { "epoch": 0.1, "grad_norm": 3.90625, "learning_rate": 4.9999712399607005e-06, "logits/chosen": -2.300098419189453, "logits/rejected": -2.225788116455078, "logps/chosen": -392.96258544921875, "logps/rejected": -414.57928466796875, "loss": 0.6337, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.0527291297912598, "rewards/margins": 0.5212673544883728, "rewards/rejected": -1.5739964246749878, "step": 1550 }, { "epoch": 0.1, "grad_norm": 9.4375, "learning_rate": 4.99993732803791e-06, "logits/chosen": -1.9543521404266357, "logits/rejected": -1.8255802392959595, "logps/chosen": -308.7613220214844, "logps/rejected": -409.419189453125, "loss": 0.5494, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.9586723446846008, "rewards/margins": 0.5728462338447571, "rewards/rejected": -1.531518578529358, "step": 1560 }, { "epoch": 0.1, "grad_norm": 17.625, "learning_rate": 4.999890373321034e-06, "logits/chosen": -1.7959396839141846, "logits/rejected": -1.7996829748153687, "logps/chosen": -347.15582275390625, "logps/rejected": -409.156494140625, "loss": 0.5462, "rewards/accuracies": 0.625, "rewards/chosen": -1.2051854133605957, "rewards/margins": 0.7283663153648376, "rewards/rejected": -1.9335517883300781, "step": 1570 }, { "epoch": 0.1, "grad_norm": 15.25, "learning_rate": 4.999830376055046e-06, "logits/chosen": -2.0862925052642822, "logits/rejected": -1.9265663623809814, "logps/chosen": -381.501708984375, "logps/rejected": -440.52545166015625, "loss": 0.6715, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.4881706237792969, "rewards/margins": 0.4534444808959961, "rewards/rejected": -1.941615104675293, "step": 1580 }, { "epoch": 0.1, "grad_norm": 11.5625, "learning_rate": 4.999757336552969e-06, "logits/chosen": -2.264866352081299, "logits/rejected": -2.2935473918914795, "logps/chosen": -434.7684631347656, "logps/rejected": -481.9228515625, "loss": 0.651, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.164011836051941, "rewards/margins": 0.39185336232185364, "rewards/rejected": -1.5558651685714722, "step": 1590 }, { "epoch": 0.1, "grad_norm": 10.5625, "learning_rate": 4.9996712551958656e-06, "logits/chosen": -2.290083169937134, "logits/rejected": -2.269726514816284, "logps/chosen": -344.64501953125, "logps/rejected": -385.2235412597656, "loss": 0.5714, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6874070763587952, "rewards/margins": 0.45198971033096313, "rewards/rejected": -1.1393969058990479, "step": 1600 }, { "epoch": 0.1, "eval_logits/chosen": -2.420902729034424, "eval_logits/rejected": -2.3794453144073486, "eval_logps/chosen": -334.2464599609375, "eval_logps/rejected": -351.0529479980469, "eval_loss": 0.6158822178840637, "eval_rewards/accuracies": 0.6614999771118164, "eval_rewards/chosen": -0.6962687969207764, "eval_rewards/margins": 0.3685435652732849, "eval_rewards/rejected": -1.0648125410079956, "eval_runtime": 1082.0523, "eval_samples_per_second": 1.848, "eval_steps_per_second": 1.848, "step": 1600 }, { "epoch": 0.11, "grad_norm": 7.625, "learning_rate": 4.999572132432847e-06, "logits/chosen": -2.082742929458618, "logits/rejected": -2.2987895011901855, "logps/chosen": -277.6634521484375, "logps/rejected": -354.29766845703125, "loss": 0.4835, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.6572118997573853, "rewards/margins": 0.6925786733627319, "rewards/rejected": -1.3497905731201172, "step": 1610 }, { "epoch": 0.11, "grad_norm": 17.125, "learning_rate": 4.999459968781057e-06, "logits/chosen": -2.171628713607788, "logits/rejected": -1.9390701055526733, "logps/chosen": -381.68695068359375, "logps/rejected": -390.44110107421875, "loss": 0.5671, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.0114991664886475, "rewards/margins": 0.5344728231430054, "rewards/rejected": -1.5459719896316528, "step": 1620 }, { "epoch": 0.11, "grad_norm": 13.75, "learning_rate": 4.999334764825685e-06, "logits/chosen": -2.036154270172119, "logits/rejected": -2.0784130096435547, "logps/chosen": -393.79364013671875, "logps/rejected": -502.73345947265625, "loss": 0.7009, "rewards/accuracies": 0.625, "rewards/chosen": -1.6744928359985352, "rewards/margins": 0.5203332901000977, "rewards/rejected": -2.1948258876800537, "step": 1630 }, { "epoch": 0.11, "grad_norm": 10.0625, "learning_rate": 4.999196521219949e-06, "logits/chosen": -2.2324390411376953, "logits/rejected": -2.1614270210266113, "logps/chosen": -390.46063232421875, "logps/rejected": -512.9299926757812, "loss": 0.5506, "rewards/accuracies": 0.75, "rewards/chosen": -1.3487623929977417, "rewards/margins": 0.74602210521698, "rewards/rejected": -2.0947844982147217, "step": 1640 }, { "epoch": 0.11, "grad_norm": 10.5, "learning_rate": 4.999045238685101e-06, "logits/chosen": -2.241751194000244, "logits/rejected": -2.139235734939575, "logps/chosen": -511.9755859375, "logps/rejected": -507.98394775390625, "loss": 0.5243, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.4565668106079102, "rewards/margins": 0.8773579597473145, "rewards/rejected": -2.3339247703552246, "step": 1650 }, { "epoch": 0.11, "grad_norm": 24.0, "learning_rate": 4.9988809180104195e-06, "logits/chosen": -1.832192063331604, "logits/rejected": -1.8446518182754517, "logps/chosen": -436.15771484375, "logps/rejected": -451.14306640625, "loss": 0.5947, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.96232008934021, "rewards/margins": 0.6292992830276489, "rewards/rejected": -2.5916194915771484, "step": 1660 }, { "epoch": 0.11, "grad_norm": 19.375, "learning_rate": 4.998703560053207e-06, "logits/chosen": -2.1415324211120605, "logits/rejected": -1.9868972301483154, "logps/chosen": -487.3998107910156, "logps/rejected": -560.1884765625, "loss": 0.7178, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.562431573867798, "rewards/margins": 0.3532525300979614, "rewards/rejected": -2.915684223175049, "step": 1670 }, { "epoch": 0.11, "grad_norm": 18.5, "learning_rate": 4.998513165738781e-06, "logits/chosen": -2.0909481048583984, "logits/rejected": -2.061009645462036, "logps/chosen": -498.45635986328125, "logps/rejected": -538.2686157226562, "loss": 0.6784, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.7231509685516357, "rewards/margins": 0.2796917259693146, "rewards/rejected": -3.002842903137207, "step": 1680 }, { "epoch": 0.11, "grad_norm": 8.8125, "learning_rate": 4.99830973606048e-06, "logits/chosen": -2.0460152626037598, "logits/rejected": -2.048966884613037, "logps/chosen": -547.4125366210938, "logps/rejected": -609.0084228515625, "loss": 0.5337, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.4459853172302246, "rewards/margins": 0.7083240747451782, "rewards/rejected": -3.1543097496032715, "step": 1690 }, { "epoch": 0.11, "grad_norm": 12.375, "learning_rate": 4.998093272079645e-06, "logits/chosen": -2.0938758850097656, "logits/rejected": -2.140860080718994, "logps/chosen": -522.2265014648438, "logps/rejected": -567.4566650390625, "loss": 0.5017, "rewards/accuracies": 0.75, "rewards/chosen": -2.489461660385132, "rewards/margins": 0.7780245542526245, "rewards/rejected": -3.267486572265625, "step": 1700 }, { "epoch": 0.11, "eval_logits/chosen": -2.1617910861968994, "eval_logits/rejected": -2.128342628479004, "eval_logps/chosen": -514.8108520507812, "eval_logps/rejected": -541.3612670898438, "eval_loss": 0.6452844142913818, "eval_rewards/accuracies": 0.6430000066757202, "eval_rewards/chosen": -2.5019121170043945, "eval_rewards/margins": 0.46598345041275024, "eval_rewards/rejected": -2.967895746231079, "eval_runtime": 1082.0269, "eval_samples_per_second": 1.848, "eval_steps_per_second": 1.848, "step": 1700 }, { "epoch": 0.11, "grad_norm": 12.375, "learning_rate": 4.997863774925622e-06, "logits/chosen": -2.259403705596924, "logits/rejected": -2.224635601043701, "logps/chosen": -471.78997802734375, "logps/rejected": -516.01123046875, "loss": 0.5726, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.0811831951141357, "rewards/margins": 0.5576003789901733, "rewards/rejected": -2.6387836933135986, "step": 1710 }, { "epoch": 0.11, "grad_norm": 18.625, "learning_rate": 4.997621245795755e-06, "logits/chosen": -2.3783888816833496, "logits/rejected": -2.369506359100342, "logps/chosen": -489.11431884765625, "logps/rejected": -451.7945251464844, "loss": 0.862, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.1857175827026367, "rewards/margins": 0.042393505573272705, "rewards/rejected": -2.2281107902526855, "step": 1720 }, { "epoch": 0.11, "grad_norm": 12.625, "learning_rate": 4.99736568595538e-06, "logits/chosen": -2.418496608734131, "logits/rejected": -2.335695505142212, "logps/chosen": -418.7733459472656, "logps/rejected": -472.39013671875, "loss": 0.6731, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.8454335927963257, "rewards/margins": 0.26266270875930786, "rewards/rejected": -2.1080963611602783, "step": 1730 }, { "epoch": 0.11, "grad_norm": 15.75, "learning_rate": 4.997097096737815e-06, "logits/chosen": -2.1312708854675293, "logits/rejected": -2.091052532196045, "logps/chosen": -410.658203125, "logps/rejected": -427.51251220703125, "loss": 0.6975, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.0052640438079834, "rewards/margins": 0.31737107038497925, "rewards/rejected": -2.3226351737976074, "step": 1740 }, { "epoch": 0.11, "grad_norm": 13.75, "learning_rate": 4.996815479544358e-06, "logits/chosen": -2.230217456817627, "logits/rejected": -2.2023110389709473, "logps/chosen": -339.3342590332031, "logps/rejected": -368.8620910644531, "loss": 0.5565, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.406976342201233, "rewards/margins": 0.48006969690322876, "rewards/rejected": -1.8870460987091064, "step": 1750 }, { "epoch": 0.12, "grad_norm": 7.40625, "learning_rate": 4.996520835844273e-06, "logits/chosen": -2.370826482772827, "logits/rejected": -2.323233127593994, "logps/chosen": -340.64984130859375, "logps/rejected": -397.90606689453125, "loss": 0.5162, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.2687333822250366, "rewards/margins": 0.6566519737243652, "rewards/rejected": -1.9253854751586914, "step": 1760 }, { "epoch": 0.12, "grad_norm": 12.125, "learning_rate": 4.996213167174792e-06, "logits/chosen": -2.1642885208129883, "logits/rejected": -2.146897554397583, "logps/chosen": -310.4258728027344, "logps/rejected": -362.51556396484375, "loss": 0.5137, "rewards/accuracies": 0.75, "rewards/chosen": -1.2297184467315674, "rewards/margins": 0.699654757976532, "rewards/rejected": -1.9293733835220337, "step": 1770 }, { "epoch": 0.12, "grad_norm": 7.09375, "learning_rate": 4.995892475141098e-06, "logits/chosen": -2.162407636642456, "logits/rejected": -2.1344072818756104, "logps/chosen": -323.6151428222656, "logps/rejected": -415.458740234375, "loss": 0.6092, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2404899597167969, "rewards/margins": 0.6016384363174438, "rewards/rejected": -1.8421283960342407, "step": 1780 }, { "epoch": 0.12, "grad_norm": 16.75, "learning_rate": 4.995558761416321e-06, "logits/chosen": -2.054608106613159, "logits/rejected": -2.0189714431762695, "logps/chosen": -381.1253662109375, "logps/rejected": -434.0181579589844, "loss": 0.4936, "rewards/accuracies": 0.75, "rewards/chosen": -0.910944938659668, "rewards/margins": 0.9660493731498718, "rewards/rejected": -1.8769941329956055, "step": 1790 }, { "epoch": 0.12, "grad_norm": 35.0, "learning_rate": 4.995212027741528e-06, "logits/chosen": -2.1378448009490967, "logits/rejected": -2.0322327613830566, "logps/chosen": -514.92626953125, "logps/rejected": -492.10089111328125, "loss": 0.6473, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.4513241052627563, "rewards/margins": 0.5095764994621277, "rewards/rejected": -1.9609006643295288, "step": 1800 }, { "epoch": 0.12, "eval_logits/chosen": -2.2148311138153076, "eval_logits/rejected": -2.1323676109313965, "eval_logps/chosen": -390.83050537109375, "eval_logps/rejected": -434.8127746582031, "eval_loss": 0.5910107493400574, "eval_rewards/accuracies": 0.6974999904632568, "eval_rewards/chosen": -1.2621089220046997, "eval_rewards/margins": 0.6403023600578308, "eval_rewards/rejected": -1.9024112224578857, "eval_runtime": 1082.1966, "eval_samples_per_second": 1.848, "eval_steps_per_second": 1.848, "step": 1800 }, { "epoch": 0.12, "grad_norm": 31.75, "learning_rate": 4.9948522759257155e-06, "logits/chosen": -2.340362071990967, "logits/rejected": -2.1867499351501465, "logps/chosen": -410.2652893066406, "logps/rejected": -424.15380859375, "loss": 0.5065, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.1459691524505615, "rewards/margins": 0.7540305852890015, "rewards/rejected": -1.8999998569488525, "step": 1810 }, { "epoch": 0.12, "grad_norm": 18.125, "learning_rate": 4.9944795078457985e-06, "logits/chosen": -2.207561492919922, "logits/rejected": -2.1731371879577637, "logps/chosen": -358.11334228515625, "logps/rejected": -400.5421447753906, "loss": 0.4786, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.6738776564598083, "rewards/margins": 0.7908266186714172, "rewards/rejected": -1.4647042751312256, "step": 1820 }, { "epoch": 0.12, "grad_norm": 11.4375, "learning_rate": 4.994093725446599e-06, "logits/chosen": -2.1186044216156006, "logits/rejected": -1.8495346307754517, "logps/chosen": -369.7090148925781, "logps/rejected": -403.43841552734375, "loss": 0.5406, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.0735784769058228, "rewards/margins": 0.6410388350486755, "rewards/rejected": -1.714617371559143, "step": 1830 }, { "epoch": 0.12, "grad_norm": 29.375, "learning_rate": 4.9936949307408415e-06, "logits/chosen": -1.8276735544204712, "logits/rejected": -1.6827329397201538, "logps/chosen": -332.9766845703125, "logps/rejected": -401.02435302734375, "loss": 0.535, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.1840097904205322, "rewards/margins": 0.8062893152236938, "rewards/rejected": -1.9902989864349365, "step": 1840 }, { "epoch": 0.12, "grad_norm": 8.0625, "learning_rate": 4.993283125809135e-06, "logits/chosen": -1.9583135843276978, "logits/rejected": -1.7675187587738037, "logps/chosen": -383.54315185546875, "logps/rejected": -468.73565673828125, "loss": 0.6904, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5637145042419434, "rewards/margins": 0.7246447801589966, "rewards/rejected": -2.2883591651916504, "step": 1850 }, { "epoch": 0.12, "grad_norm": 12.75, "learning_rate": 4.99285831279997e-06, "logits/chosen": -1.8117868900299072, "logits/rejected": -1.8212286233901978, "logps/chosen": -404.87567138671875, "logps/rejected": -499.5162048339844, "loss": 0.513, "rewards/accuracies": 0.75, "rewards/chosen": -0.9862926602363586, "rewards/margins": 0.7943549156188965, "rewards/rejected": -1.7806476354599, "step": 1860 }, { "epoch": 0.12, "grad_norm": 8.625, "learning_rate": 4.9924204939296995e-06, "logits/chosen": -1.9939438104629517, "logits/rejected": -1.7611968517303467, "logps/chosen": -419.70147705078125, "logps/rejected": -384.85137939453125, "loss": 0.6281, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.2581182718276978, "rewards/margins": 0.543582558631897, "rewards/rejected": -1.8017008304595947, "step": 1870 }, { "epoch": 0.12, "grad_norm": 15.25, "learning_rate": 4.9919696714825365e-06, "logits/chosen": -1.8705196380615234, "logits/rejected": -1.883033037185669, "logps/chosen": -346.4654235839844, "logps/rejected": -397.37994384765625, "loss": 0.6311, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9935609102249146, "rewards/margins": 0.6887490749359131, "rewards/rejected": -1.6823101043701172, "step": 1880 }, { "epoch": 0.12, "grad_norm": 8.875, "learning_rate": 4.99150584781053e-06, "logits/chosen": -2.0246591567993164, "logits/rejected": -1.7514724731445312, "logps/chosen": -328.065673828125, "logps/rejected": -367.72479248046875, "loss": 0.6511, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.9621168375015259, "rewards/margins": 0.3358403742313385, "rewards/rejected": -1.297956943511963, "step": 1890 }, { "epoch": 0.12, "grad_norm": 18.0, "learning_rate": 4.991029025333566e-06, "logits/chosen": -2.1413633823394775, "logits/rejected": -1.983350396156311, "logps/chosen": -340.58843994140625, "logps/rejected": -405.20513916015625, "loss": 0.6148, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.041387915611267, "rewards/margins": 0.6128355264663696, "rewards/rejected": -1.6542232036590576, "step": 1900 }, { "epoch": 0.12, "eval_logits/chosen": -2.0483407974243164, "eval_logits/rejected": -1.912282109260559, "eval_logps/chosen": -355.8019714355469, "eval_logps/rejected": -395.4436340332031, "eval_loss": 0.5745924115180969, "eval_rewards/accuracies": 0.7014999985694885, "eval_rewards/chosen": -0.9118241667747498, "eval_rewards/margins": 0.596895694732666, "eval_rewards/rejected": -1.5087199211120605, "eval_runtime": 1082.8416, "eval_samples_per_second": 1.847, "eval_steps_per_second": 1.847, "step": 1900 }, { "epoch": 0.12, "grad_norm": 8.1875, "learning_rate": 4.990539206539344e-06, "logits/chosen": -2.0032730102539062, "logits/rejected": -1.9161975383758545, "logps/chosen": -355.62359619140625, "logps/rejected": -320.6944885253906, "loss": 0.5985, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.928227424621582, "rewards/margins": 0.4253101348876953, "rewards/rejected": -1.3535375595092773, "step": 1910 }, { "epoch": 0.13, "grad_norm": 4.09375, "learning_rate": 4.990036393983372e-06, "logits/chosen": -1.5441153049468994, "logits/rejected": -1.6281633377075195, "logps/chosen": -293.44744873046875, "logps/rejected": -443.0909118652344, "loss": 0.4687, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0483801364898682, "rewards/margins": 0.9791606068611145, "rewards/rejected": -2.027540683746338, "step": 1920 }, { "epoch": 0.13, "grad_norm": 12.8125, "learning_rate": 4.989520590288947e-06, "logits/chosen": -1.397741675376892, "logits/rejected": -1.7153427600860596, "logps/chosen": -383.5269470214844, "logps/rejected": -462.8575744628906, "loss": 0.4584, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.2852225303649902, "rewards/margins": 1.0747359991073608, "rewards/rejected": -2.3599584102630615, "step": 1930 }, { "epoch": 0.13, "grad_norm": 46.25, "learning_rate": 4.988991798147145e-06, "logits/chosen": -1.2112948894500732, "logits/rejected": -1.031630039215088, "logps/chosen": -463.3036193847656, "logps/rejected": -598.1160278320312, "loss": 0.6383, "rewards/accuracies": 0.625, "rewards/chosen": -2.3999953269958496, "rewards/margins": 0.9064973592758179, "rewards/rejected": -3.306492567062378, "step": 1940 }, { "epoch": 0.13, "grad_norm": 90.5, "learning_rate": 4.9884500203168066e-06, "logits/chosen": -1.129406213760376, "logits/rejected": -0.8300798535346985, "logps/chosen": -573.8423461914062, "logps/rejected": -624.1845703125, "loss": 0.6489, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.02606463432312, "rewards/margins": 1.0546177625656128, "rewards/rejected": -4.080681800842285, "step": 1950 }, { "epoch": 0.13, "grad_norm": 19.875, "learning_rate": 4.987895259624521e-06, "logits/chosen": -1.5944287776947021, "logits/rejected": -1.307204008102417, "logps/chosen": -539.3009033203125, "logps/rejected": -582.1524047851562, "loss": 0.6564, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.625255584716797, "rewards/margins": 0.6217406988143921, "rewards/rejected": -3.2469964027404785, "step": 1960 }, { "epoch": 0.13, "grad_norm": 17.0, "learning_rate": 4.987327518964613e-06, "logits/chosen": -1.5503787994384766, "logits/rejected": -1.0238831043243408, "logps/chosen": -520.8883666992188, "logps/rejected": -552.2244873046875, "loss": 0.5757, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.8061327934265137, "rewards/margins": 0.6910549998283386, "rewards/rejected": -3.497187852859497, "step": 1970 }, { "epoch": 0.13, "grad_norm": 8.5625, "learning_rate": 4.986746801299125e-06, "logits/chosen": -1.1422641277313232, "logits/rejected": -1.3856160640716553, "logps/chosen": -566.09130859375, "logps/rejected": -622.683349609375, "loss": 0.5659, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.1424896717071533, "rewards/margins": 0.508002519607544, "rewards/rejected": -3.6504921913146973, "step": 1980 }, { "epoch": 0.13, "grad_norm": 42.75, "learning_rate": 4.986153109657807e-06, "logits/chosen": -1.7044328451156616, "logits/rejected": -1.4812885522842407, "logps/chosen": -635.2144775390625, "logps/rejected": -703.6630859375, "loss": 0.6452, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -3.3474185466766357, "rewards/margins": 0.4996156692504883, "rewards/rejected": -3.847034454345703, "step": 1990 }, { "epoch": 0.13, "grad_norm": 55.75, "learning_rate": 4.985546447138096e-06, "logits/chosen": -1.2817676067352295, "logits/rejected": -1.0377562046051025, "logps/chosen": -500.99530029296875, "logps/rejected": -518.9309692382812, "loss": 0.7404, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.368818998336792, "rewards/margins": 0.6239749789237976, "rewards/rejected": -2.9927937984466553, "step": 2000 }, { "epoch": 0.13, "eval_logits/chosen": -1.6820718050003052, "eval_logits/rejected": -1.5444999933242798, "eval_logps/chosen": -436.87420654296875, "eval_logps/rejected": -496.0523376464844, "eval_loss": 0.5778976082801819, "eval_rewards/accuracies": 0.7014999985694885, "eval_rewards/chosen": -1.7225462198257446, "eval_rewards/margins": 0.7922602891921997, "eval_rewards/rejected": -2.5148067474365234, "eval_runtime": 1083.2384, "eval_samples_per_second": 1.846, "eval_steps_per_second": 1.846, "step": 2000 }, { "epoch": 0.13, "grad_norm": 12.5625, "learning_rate": 4.984926816905099e-06, "logits/chosen": -1.5279616117477417, "logits/rejected": -1.163779377937317, "logps/chosen": -409.876220703125, "logps/rejected": -477.8462829589844, "loss": 0.5019, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5433762073516846, "rewards/margins": 0.9713126420974731, "rewards/rejected": -2.514688730239868, "step": 2010 }, { "epoch": 0.13, "grad_norm": 15.4375, "learning_rate": 4.984294222191582e-06, "logits/chosen": -1.6552928686141968, "logits/rejected": -1.7187728881835938, "logps/chosen": -368.5750732421875, "logps/rejected": -476.22967529296875, "loss": 0.6351, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.3200690746307373, "rewards/margins": 0.8063483238220215, "rewards/rejected": -2.126417398452759, "step": 2020 }, { "epoch": 0.13, "grad_norm": 9.0625, "learning_rate": 4.98364866629795e-06, "logits/chosen": -1.8336403369903564, "logits/rejected": -1.843130111694336, "logps/chosen": -385.27655029296875, "logps/rejected": -396.94232177734375, "loss": 0.505, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.1291840076446533, "rewards/margins": 0.6425586342811584, "rewards/rejected": -1.771742820739746, "step": 2030 }, { "epoch": 0.13, "grad_norm": 15.6875, "learning_rate": 4.982990152592228e-06, "logits/chosen": -1.7534250020980835, "logits/rejected": -1.5721526145935059, "logps/chosen": -374.6744079589844, "logps/rejected": -375.98028564453125, "loss": 0.5973, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.221355676651001, "rewards/margins": 0.5279499292373657, "rewards/rejected": -1.7493054866790771, "step": 2040 }, { "epoch": 0.13, "grad_norm": 26.5, "learning_rate": 4.982318684510044e-06, "logits/chosen": -1.427851915359497, "logits/rejected": -1.2452611923217773, "logps/chosen": -440.1607360839844, "logps/rejected": -567.3175659179688, "loss": 0.5042, "rewards/accuracies": 0.75, "rewards/chosen": -1.9199590682983398, "rewards/margins": 1.0581566095352173, "rewards/rejected": -2.9781153202056885, "step": 2050 }, { "epoch": 0.13, "grad_norm": 10.0625, "learning_rate": 4.981634265554614e-06, "logits/chosen": -1.3232144117355347, "logits/rejected": -1.2064357995986938, "logps/chosen": -429.57427978515625, "logps/rejected": -510.7268981933594, "loss": 0.5996, "rewards/accuracies": 0.625, "rewards/chosen": -2.111093044281006, "rewards/margins": 0.7634450793266296, "rewards/rejected": -2.874537706375122, "step": 2060 }, { "epoch": 0.14, "grad_norm": 24.875, "learning_rate": 4.980936899296724e-06, "logits/chosen": -1.6515147686004639, "logits/rejected": -1.5823218822479248, "logps/chosen": -406.72552490234375, "logps/rejected": -494.5320739746094, "loss": 0.5876, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5486453771591187, "rewards/margins": 0.573828399181366, "rewards/rejected": -2.122473955154419, "step": 2070 }, { "epoch": 0.14, "grad_norm": 30.375, "learning_rate": 4.980226589374705e-06, "logits/chosen": -1.3826872110366821, "logits/rejected": -1.5359643697738647, "logps/chosen": -380.80194091796875, "logps/rejected": -444.154052734375, "loss": 0.5926, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4236021041870117, "rewards/margins": 0.6320102214813232, "rewards/rejected": -2.055612325668335, "step": 2080 }, { "epoch": 0.14, "grad_norm": 11.4375, "learning_rate": 4.9795033394944215e-06, "logits/chosen": -1.8715215921401978, "logits/rejected": -1.7310231924057007, "logps/chosen": -351.7757263183594, "logps/rejected": -410.4307556152344, "loss": 0.6526, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.264648675918579, "rewards/margins": 0.5617357492446899, "rewards/rejected": -1.8263843059539795, "step": 2090 }, { "epoch": 0.14, "grad_norm": 14.6875, "learning_rate": 4.978767153429246e-06, "logits/chosen": -1.7237598896026611, "logits/rejected": -1.731188178062439, "logps/chosen": -380.6629333496094, "logps/rejected": -488.7897033691406, "loss": 0.4925, "rewards/accuracies": 0.75, "rewards/chosen": -1.4598950147628784, "rewards/margins": 0.8693640828132629, "rewards/rejected": -2.329259157180786, "step": 2100 }, { "epoch": 0.14, "eval_logits/chosen": -1.4880908727645874, "eval_logits/rejected": -1.341452717781067, "eval_logps/chosen": -452.9669189453125, "eval_logps/rejected": -513.9924926757812, "eval_loss": 0.5994824171066284, "eval_rewards/accuracies": 0.6915000081062317, "eval_rewards/chosen": -1.8834730386734009, "eval_rewards/margins": 0.8107352256774902, "eval_rewards/rejected": -2.6942081451416016, "eval_runtime": 1082.9182, "eval_samples_per_second": 1.847, "eval_steps_per_second": 1.847, "step": 2100 }, { "epoch": 0.14, "grad_norm": 7.0625, "learning_rate": 4.978018035020047e-06, "logits/chosen": -1.3732080459594727, "logits/rejected": -1.3284085988998413, "logps/chosen": -484.8277282714844, "logps/rejected": -518.8592529296875, "loss": 0.6757, "rewards/accuracies": 0.625, "rewards/chosen": -2.011997938156128, "rewards/margins": 0.6339169144630432, "rewards/rejected": -2.6459147930145264, "step": 2110 }, { "epoch": 0.14, "grad_norm": 15.5, "learning_rate": 4.977255988175158e-06, "logits/chosen": -1.4384328126907349, "logits/rejected": -1.3010741472244263, "logps/chosen": -422.0142517089844, "logps/rejected": -524.7828979492188, "loss": 0.569, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7687381505966187, "rewards/margins": 0.7892870903015137, "rewards/rejected": -2.558025360107422, "step": 2120 }, { "epoch": 0.14, "grad_norm": 34.5, "learning_rate": 4.976481016870369e-06, "logits/chosen": -1.2089284658432007, "logits/rejected": -1.0628106594085693, "logps/chosen": -522.5753173828125, "logps/rejected": -496.86810302734375, "loss": 0.6769, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.3268563747406006, "rewards/margins": 0.5063682198524475, "rewards/rejected": -2.8332247734069824, "step": 2130 }, { "epoch": 0.14, "grad_norm": 28.875, "learning_rate": 4.9756931251488965e-06, "logits/chosen": -1.2838311195373535, "logits/rejected": -1.0897486209869385, "logps/chosen": -486.38330078125, "logps/rejected": -597.7864379882812, "loss": 0.5064, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.5414881706237793, "rewards/margins": 0.9630319476127625, "rewards/rejected": -3.5045199394226074, "step": 2140 }, { "epoch": 0.14, "grad_norm": 24.875, "learning_rate": 4.974892317121368e-06, "logits/chosen": -1.3117762804031372, "logits/rejected": -1.450843095779419, "logps/chosen": -496.73760986328125, "logps/rejected": -525.486572265625, "loss": 0.7217, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.2699501514434814, "rewards/margins": 0.5521320700645447, "rewards/rejected": -2.822082281112671, "step": 2150 }, { "epoch": 0.14, "grad_norm": 11.8125, "learning_rate": 4.974078596965799e-06, "logits/chosen": -1.2212114334106445, "logits/rejected": -1.3899569511413574, "logps/chosen": -426.61126708984375, "logps/rejected": -501.4776306152344, "loss": 0.5094, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.803209662437439, "rewards/margins": 0.9061592221260071, "rewards/rejected": -2.709368944168091, "step": 2160 }, { "epoch": 0.14, "grad_norm": 24.0, "learning_rate": 4.973251968927567e-06, "logits/chosen": -1.4505054950714111, "logits/rejected": -1.2147958278656006, "logps/chosen": -512.5052490234375, "logps/rejected": -561.25390625, "loss": 0.5009, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.4163036346435547, "rewards/margins": 0.8615167737007141, "rewards/rejected": -3.277820587158203, "step": 2170 }, { "epoch": 0.14, "grad_norm": 19.875, "learning_rate": 4.9724124373193985e-06, "logits/chosen": -1.426679253578186, "logits/rejected": -1.2293018102645874, "logps/chosen": -474.3233947753906, "logps/rejected": -596.445556640625, "loss": 0.5723, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.5306286811828613, "rewards/margins": 1.1353285312652588, "rewards/rejected": -3.66595721244812, "step": 2180 }, { "epoch": 0.14, "grad_norm": 10.0625, "learning_rate": 4.971560006521338e-06, "logits/chosen": -1.4075731039047241, "logits/rejected": -1.327178716659546, "logps/chosen": -560.4766845703125, "logps/rejected": -646.1405029296875, "loss": 0.4412, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.732370376586914, "rewards/margins": 1.2725470066070557, "rewards/rejected": -4.004917144775391, "step": 2190 }, { "epoch": 0.14, "grad_norm": 16.0, "learning_rate": 4.970694680980727e-06, "logits/chosen": -0.8152478337287903, "logits/rejected": -0.5278416275978088, "logps/chosen": -673.9854125976562, "logps/rejected": -722.3944091796875, "loss": 0.6846, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.415452003479004, "rewards/margins": 0.7075659036636353, "rewards/rejected": -5.12301778793335, "step": 2200 }, { "epoch": 0.14, "eval_logits/chosen": -0.8833072185516357, "eval_logits/rejected": -0.7665340900421143, "eval_logps/chosen": -745.3392944335938, "eval_logps/rejected": -802.6066284179688, "eval_loss": 0.6260702013969421, "eval_rewards/accuracies": 0.6809999942779541, "eval_rewards/chosen": -4.807196140289307, "eval_rewards/margins": 0.7731525897979736, "eval_rewards/rejected": -5.580349445343018, "eval_runtime": 1082.9444, "eval_samples_per_second": 1.847, "eval_steps_per_second": 1.847, "step": 2200 }, { "epoch": 0.14, "grad_norm": 23.625, "learning_rate": 4.969816465212185e-06, "logits/chosen": -1.0275278091430664, "logits/rejected": -1.0802525281906128, "logps/chosen": -761.7613525390625, "logps/rejected": -806.0810546875, "loss": 0.5098, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.714092254638672, "rewards/margins": 0.991226077079773, "rewards/rejected": -5.705317974090576, "step": 2210 }, { "epoch": 0.15, "grad_norm": 8.6875, "learning_rate": 4.968925363797581e-06, "logits/chosen": -1.152628779411316, "logits/rejected": -0.879848837852478, "logps/chosen": -623.21533203125, "logps/rejected": -626.6482543945312, "loss": 0.6156, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.390092134475708, "rewards/margins": 0.7799450755119324, "rewards/rejected": -4.170037269592285, "step": 2220 }, { "epoch": 0.15, "grad_norm": 22.875, "learning_rate": 4.968021381386014e-06, "logits/chosen": -1.6273800134658813, "logits/rejected": -1.4176734685897827, "logps/chosen": -469.6507263183594, "logps/rejected": -509.019287109375, "loss": 0.5487, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.8040096759796143, "rewards/margins": 0.995147705078125, "rewards/rejected": -2.7991573810577393, "step": 2230 }, { "epoch": 0.15, "grad_norm": 8.125, "learning_rate": 4.967104522693784e-06, "logits/chosen": -1.2210497856140137, "logits/rejected": -1.4143774509429932, "logps/chosen": -359.1187438964844, "logps/rejected": -471.95428466796875, "loss": 0.5563, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6878944635391235, "rewards/margins": 0.750239372253418, "rewards/rejected": -2.438133716583252, "step": 2240 }, { "epoch": 0.15, "grad_norm": 13.1875, "learning_rate": 4.966174792504371e-06, "logits/chosen": -1.4142194986343384, "logits/rejected": -1.3239561319351196, "logps/chosen": -403.1938171386719, "logps/rejected": -446.2001953125, "loss": 0.6597, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.233825445175171, "rewards/margins": 0.48243266344070435, "rewards/rejected": -1.7162582874298096, "step": 2250 }, { "epoch": 0.15, "grad_norm": 13.875, "learning_rate": 4.965232195668408e-06, "logits/chosen": -1.495906114578247, "logits/rejected": -1.3948304653167725, "logps/chosen": -460.059326171875, "logps/rejected": -512.9342651367188, "loss": 0.6, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5857712030410767, "rewards/margins": 0.6473572254180908, "rewards/rejected": -2.233128547668457, "step": 2260 }, { "epoch": 0.15, "grad_norm": 13.0, "learning_rate": 4.964276737103656e-06, "logits/chosen": -1.2583377361297607, "logits/rejected": -0.9090865254402161, "logps/chosen": -398.15032958984375, "logps/rejected": -509.7491760253906, "loss": 0.3346, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.4380770921707153, "rewards/margins": 1.4196299314498901, "rewards/rejected": -2.8577072620391846, "step": 2270 }, { "epoch": 0.15, "grad_norm": 3.9375, "learning_rate": 4.9633084217949814e-06, "logits/chosen": -1.1924808025360107, "logits/rejected": -0.9396651387214661, "logps/chosen": -488.5087890625, "logps/rejected": -606.2218017578125, "loss": 0.4488, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.5999720096588135, "rewards/margins": 1.1447409391403198, "rewards/rejected": -3.7447128295898438, "step": 2280 }, { "epoch": 0.15, "grad_norm": 50.75, "learning_rate": 4.9623272547943245e-06, "logits/chosen": -0.4400664269924164, "logits/rejected": -0.20189061760902405, "logps/chosen": -713.979736328125, "logps/rejected": -880.3170166015625, "loss": 0.4418, "rewards/accuracies": 0.75, "rewards/chosen": -4.738163948059082, "rewards/margins": 1.5463576316833496, "rewards/rejected": -6.28452205657959, "step": 2290 }, { "epoch": 0.15, "grad_norm": 35.5, "learning_rate": 4.9613332412206775e-06, "logits/chosen": -0.22080376744270325, "logits/rejected": 0.04498801752924919, "logps/chosen": -789.7938232421875, "logps/rejected": -1016.6892700195312, "loss": 0.4865, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -5.565503120422363, "rewards/margins": 1.8310019969940186, "rewards/rejected": -7.396505832672119, "step": 2300 }, { "epoch": 0.15, "eval_logits/chosen": -0.027861768379807472, "eval_logits/rejected": 0.13252075016498566, "eval_logps/chosen": -883.5037231445312, "eval_logps/rejected": -997.1057739257812, "eval_loss": 0.7694874405860901, "eval_rewards/accuracies": 0.6669999957084656, "eval_rewards/chosen": -6.188841819763184, "eval_rewards/margins": 1.3364986181259155, "eval_rewards/rejected": -7.525341033935547, "eval_runtime": 1083.0817, "eval_samples_per_second": 1.847, "eval_steps_per_second": 1.847, "step": 2300 }, { "epoch": 0.15, "grad_norm": 162.0, "learning_rate": 4.960326386260056e-06, "logits/chosen": 0.5902056097984314, "logits/rejected": 0.41426968574523926, "logps/chosen": -1001.8937377929688, "logps/rejected": -1149.8394775390625, "loss": 0.8589, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -7.8423027992248535, "rewards/margins": 1.3434627056121826, "rewards/rejected": -9.185765266418457, "step": 2310 }, { "epoch": 0.15, "grad_norm": 12.0, "learning_rate": 4.9593066951654725e-06, "logits/chosen": -0.633233904838562, "logits/rejected": -0.08909018337726593, "logps/chosen": -727.8616333007812, "logps/rejected": -839.65625, "loss": 0.7893, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.664944171905518, "rewards/margins": 1.4014666080474854, "rewards/rejected": -6.066411018371582, "step": 2320 }, { "epoch": 0.15, "grad_norm": 35.5, "learning_rate": 4.958274173256909e-06, "logits/chosen": -0.7931697964668274, "logits/rejected": -0.4919402003288269, "logps/chosen": -451.46820068359375, "logps/rejected": -522.3240966796875, "loss": 0.5859, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.391993761062622, "rewards/margins": 0.9424756169319153, "rewards/rejected": -3.3344693183898926, "step": 2330 }, { "epoch": 0.15, "grad_norm": 38.25, "learning_rate": 4.957228825921289e-06, "logits/chosen": -1.4807980060577393, "logits/rejected": -1.5006507635116577, "logps/chosen": -544.6298217773438, "logps/rejected": -602.5860595703125, "loss": 0.6878, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.169495105743408, "rewards/margins": 0.7021037936210632, "rewards/rejected": -2.8715991973876953, "step": 2340 }, { "epoch": 0.15, "grad_norm": 27.75, "learning_rate": 4.956170658612449e-06, "logits/chosen": -1.534315824508667, "logits/rejected": -1.3447673320770264, "logps/chosen": -522.5127563476562, "logps/rejected": -539.3524169921875, "loss": 0.631, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.9628496170043945, "rewards/margins": 0.7016445994377136, "rewards/rejected": -2.664494276046753, "step": 2350 }, { "epoch": 0.15, "grad_norm": 14.5, "learning_rate": 4.955099676851112e-06, "logits/chosen": -1.8299968242645264, "logits/rejected": -1.6151864528656006, "logps/chosen": -453.64141845703125, "logps/rejected": -531.650634765625, "loss": 0.6059, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.7843475341796875, "rewards/margins": 0.7316817045211792, "rewards/rejected": -2.5160293579101562, "step": 2360 }, { "epoch": 0.16, "grad_norm": 16.375, "learning_rate": 4.954015886224853e-06, "logits/chosen": -1.8213001489639282, "logits/rejected": -1.7565510272979736, "logps/chosen": -378.63641357421875, "logps/rejected": -430.8412170410156, "loss": 0.6472, "rewards/accuracies": 0.625, "rewards/chosen": -1.4202426671981812, "rewards/margins": 0.3710296154022217, "rewards/rejected": -1.7912724018096924, "step": 2370 }, { "epoch": 0.16, "grad_norm": 10.375, "learning_rate": 4.952919292388079e-06, "logits/chosen": -1.5351765155792236, "logits/rejected": -1.3853943347930908, "logps/chosen": -361.0724182128906, "logps/rejected": -578.3220825195312, "loss": 0.4583, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.413024663925171, "rewards/margins": 1.7087692022323608, "rewards/rejected": -3.1217939853668213, "step": 2380 }, { "epoch": 0.16, "grad_norm": 15.25, "learning_rate": 4.951809901061992e-06, "logits/chosen": -1.4425779581069946, "logits/rejected": -1.0798763036727905, "logps/chosen": -467.03118896484375, "logps/rejected": -587.8226318359375, "loss": 0.4837, "rewards/accuracies": 0.75, "rewards/chosen": -1.9386409521102905, "rewards/margins": 1.3178192377090454, "rewards/rejected": -3.256460189819336, "step": 2390 }, { "epoch": 0.16, "grad_norm": 34.5, "learning_rate": 4.950687718034563e-06, "logits/chosen": -1.2271991968154907, "logits/rejected": -1.4019076824188232, "logps/chosen": -490.3935546875, "logps/rejected": -578.2962646484375, "loss": 0.512, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.036787509918213, "rewards/margins": 0.9740701913833618, "rewards/rejected": -3.0108578205108643, "step": 2400 }, { "epoch": 0.16, "eval_logits/chosen": -1.3015968799591064, "eval_logits/rejected": -1.1445417404174805, "eval_logps/chosen": -475.35638427734375, "eval_logps/rejected": -546.8381958007812, "eval_loss": 0.5833767056465149, "eval_rewards/accuracies": 0.7005000114440918, "eval_rewards/chosen": -2.107367992401123, "eval_rewards/margins": 0.9152973294258118, "eval_rewards/rejected": -3.02266526222229, "eval_runtime": 1082.1741, "eval_samples_per_second": 1.848, "eval_steps_per_second": 1.848, "step": 2400 }, { "epoch": 0.16, "grad_norm": 4.8125, "learning_rate": 4.949552749160498e-06, "logits/chosen": -1.5391428470611572, "logits/rejected": -1.1120718717575073, "logps/chosen": -501.46636962890625, "logps/rejected": -546.97509765625, "loss": 0.4589, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.1456356048583984, "rewards/margins": 1.0223822593688965, "rewards/rejected": -3.1680173873901367, "step": 2410 }, { "epoch": 0.16, "grad_norm": 11.8125, "learning_rate": 4.9484050003612115e-06, "logits/chosen": -1.2480593919754028, "logits/rejected": -1.3148705959320068, "logps/chosen": -469.01495361328125, "logps/rejected": -528.4212036132812, "loss": 0.6073, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.176260232925415, "rewards/margins": 0.7273812294006348, "rewards/rejected": -2.903641700744629, "step": 2420 }, { "epoch": 0.16, "grad_norm": 8.25, "learning_rate": 4.947244477624796e-06, "logits/chosen": -1.374891996383667, "logits/rejected": -1.1316957473754883, "logps/chosen": -488.27178955078125, "logps/rejected": -602.121826171875, "loss": 0.6098, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.2364022731781006, "rewards/margins": 1.1014007329940796, "rewards/rejected": -3.337803363800049, "step": 2430 }, { "epoch": 0.16, "grad_norm": 52.0, "learning_rate": 4.9460711870059866e-06, "logits/chosen": -1.5563279390335083, "logits/rejected": -0.8609926104545593, "logps/chosen": -433.05560302734375, "logps/rejected": -425.2903747558594, "loss": 0.5206, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6424890756607056, "rewards/margins": 0.8093570470809937, "rewards/rejected": -2.4518463611602783, "step": 2440 }, { "epoch": 0.16, "grad_norm": 10.1875, "learning_rate": 4.9448851346261305e-06, "logits/chosen": -1.3388367891311646, "logits/rejected": -1.0877244472503662, "logps/chosen": -494.03863525390625, "logps/rejected": -485.5228576660156, "loss": 0.7008, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.141556739807129, "rewards/margins": 0.4954777657985687, "rewards/rejected": -2.6370344161987305, "step": 2450 }, { "epoch": 0.16, "grad_norm": 27.125, "learning_rate": 4.943686326673159e-06, "logits/chosen": -0.9239088296890259, "logits/rejected": -0.8298670649528503, "logps/chosen": -507.1578063964844, "logps/rejected": -585.5223999023438, "loss": 0.6534, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.640493869781494, "rewards/margins": 0.8850871920585632, "rewards/rejected": -3.525580883026123, "step": 2460 }, { "epoch": 0.16, "grad_norm": 32.5, "learning_rate": 4.942474769401549e-06, "logits/chosen": -1.3317737579345703, "logits/rejected": -1.1517871618270874, "logps/chosen": -545.4752807617188, "logps/rejected": -610.5505981445312, "loss": 0.5912, "rewards/accuracies": 0.75, "rewards/chosen": -2.6572036743164062, "rewards/margins": 0.9399458169937134, "rewards/rejected": -3.597149610519409, "step": 2470 }, { "epoch": 0.16, "grad_norm": 21.625, "learning_rate": 4.941250469132298e-06, "logits/chosen": -1.2961399555206299, "logits/rejected": -0.5644221305847168, "logps/chosen": -419.9778747558594, "logps/rejected": -529.6748657226562, "loss": 0.3847, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.8336149454116821, "rewards/margins": 1.5583337545394897, "rewards/rejected": -3.391948699951172, "step": 2480 }, { "epoch": 0.16, "grad_norm": 41.0, "learning_rate": 4.940013432252884e-06, "logits/chosen": -1.1810494661331177, "logits/rejected": -0.9430990219116211, "logps/chosen": -412.9159240722656, "logps/rejected": -490.522216796875, "loss": 0.4464, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.9292335510253906, "rewards/margins": 1.1685705184936523, "rewards/rejected": -3.097804307937622, "step": 2490 }, { "epoch": 0.16, "grad_norm": 25.75, "learning_rate": 4.938763665217235e-06, "logits/chosen": -0.8738414645195007, "logits/rejected": -0.9322101473808289, "logps/chosen": -429.00775146484375, "logps/rejected": -556.90380859375, "loss": 0.5232, "rewards/accuracies": 0.75, "rewards/chosen": -2.2805495262145996, "rewards/margins": 1.2450988292694092, "rewards/rejected": -3.525648593902588, "step": 2500 }, { "epoch": 0.16, "eval_logits/chosen": -1.2295386791229248, "eval_logits/rejected": -0.9054907560348511, "eval_logps/chosen": -489.4522399902344, "eval_logps/rejected": -570.016845703125, "eval_loss": 0.5785664319992065, "eval_rewards/accuracies": 0.7080000042915344, "eval_rewards/chosen": -2.2483270168304443, "eval_rewards/margins": 1.00612473487854, "eval_rewards/rejected": -3.2544517517089844, "eval_runtime": 1082.2439, "eval_samples_per_second": 1.848, "eval_steps_per_second": 1.848, "step": 2500 }, { "epoch": 0.16, "grad_norm": 7.0625, "learning_rate": 4.937501174545697e-06, "logits/chosen": -1.3089654445648193, "logits/rejected": -1.1359285116195679, "logps/chosen": -450.65106201171875, "logps/rejected": -520.7359619140625, "loss": 0.6274, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.1344285011291504, "rewards/margins": 0.6826275587081909, "rewards/rejected": -2.817056179046631, "step": 2510 }, { "epoch": 0.16, "grad_norm": 35.75, "learning_rate": 4.936225966824997e-06, "logits/chosen": -1.023093342781067, "logits/rejected": -1.0152443647384644, "logps/chosen": -482.58587646484375, "logps/rejected": -524.8193359375, "loss": 0.7202, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.329948663711548, "rewards/margins": 0.7868360280990601, "rewards/rejected": -3.1167845726013184, "step": 2520 }, { "epoch": 0.17, "grad_norm": 36.5, "learning_rate": 4.93493804870821e-06, "logits/chosen": -1.168610692024231, "logits/rejected": -1.0791141986846924, "logps/chosen": -405.5086975097656, "logps/rejected": -467.8145446777344, "loss": 0.4804, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6960151195526123, "rewards/margins": 1.001037359237671, "rewards/rejected": -2.697052478790283, "step": 2530 }, { "epoch": 0.17, "grad_norm": 17.125, "learning_rate": 4.933637426914726e-06, "logits/chosen": -1.3853102922439575, "logits/rejected": -1.0201982259750366, "logps/chosen": -512.2803955078125, "logps/rejected": -521.1387939453125, "loss": 0.626, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.1194682121276855, "rewards/margins": 0.5509535074234009, "rewards/rejected": -2.670422315597534, "step": 2540 }, { "epoch": 0.17, "grad_norm": 8.5625, "learning_rate": 4.932324108230211e-06, "logits/chosen": -1.108056664466858, "logits/rejected": -0.5687418580055237, "logps/chosen": -454.097900390625, "logps/rejected": -568.02734375, "loss": 0.598, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.2652018070220947, "rewards/margins": 0.9240036010742188, "rewards/rejected": -3.1892056465148926, "step": 2550 }, { "epoch": 0.17, "grad_norm": 7.9375, "learning_rate": 4.9309980995065755e-06, "logits/chosen": -1.1510646343231201, "logits/rejected": -0.8338361978530884, "logps/chosen": -486.3438415527344, "logps/rejected": -531.1924438476562, "loss": 0.6256, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.335334539413452, "rewards/margins": 0.9830752611160278, "rewards/rejected": -3.3184096813201904, "step": 2560 }, { "epoch": 0.17, "grad_norm": 25.375, "learning_rate": 4.929659407661935e-06, "logits/chosen": -0.701055109500885, "logits/rejected": -0.3426407277584076, "logps/chosen": -517.5845947265625, "logps/rejected": -509.13360595703125, "loss": 0.6481, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.3730268478393555, "rewards/margins": 0.6936885714530945, "rewards/rejected": -3.0667152404785156, "step": 2570 }, { "epoch": 0.17, "grad_norm": 37.0, "learning_rate": 4.928308039680579e-06, "logits/chosen": -0.9917882680892944, "logits/rejected": -0.6612027883529663, "logps/chosen": -496.33526611328125, "logps/rejected": -588.1357421875, "loss": 0.551, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.526604175567627, "rewards/margins": 0.8510949015617371, "rewards/rejected": -3.3776988983154297, "step": 2580 }, { "epoch": 0.17, "grad_norm": 12.0625, "learning_rate": 4.92694400261293e-06, "logits/chosen": -1.0283622741699219, "logits/rejected": -0.8825260996818542, "logps/chosen": -512.4873657226562, "logps/rejected": -568.0089111328125, "loss": 0.6335, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.5195558071136475, "rewards/margins": 0.7351851463317871, "rewards/rejected": -3.2547411918640137, "step": 2590 }, { "epoch": 0.17, "grad_norm": 14.75, "learning_rate": 4.925567303575507e-06, "logits/chosen": -1.3967078924179077, "logits/rejected": -0.7938148975372314, "logps/chosen": -574.72802734375, "logps/rejected": -563.0109252929688, "loss": 0.624, "rewards/accuracies": 0.75, "rewards/chosen": -2.569070339202881, "rewards/margins": 0.5729259252548218, "rewards/rejected": -3.141996383666992, "step": 2600 }, { "epoch": 0.17, "eval_logits/chosen": -1.1030482053756714, "eval_logits/rejected": -0.8068652153015137, "eval_logps/chosen": -503.64947509765625, "eval_logps/rejected": -565.4991455078125, "eval_loss": 0.5485803484916687, "eval_rewards/accuracies": 0.7210000157356262, "eval_rewards/chosen": -2.390298366546631, "eval_rewards/margins": 0.8189761638641357, "eval_rewards/rejected": -3.2092745304107666, "eval_runtime": 1082.2042, "eval_samples_per_second": 1.848, "eval_steps_per_second": 1.848, "step": 2600 }, { "epoch": 0.17, "grad_norm": 4.40625, "learning_rate": 4.924177949750893e-06, "logits/chosen": -0.9705875515937805, "logits/rejected": -0.4868856370449066, "logps/chosen": -500.82415771484375, "logps/rejected": -584.556396484375, "loss": 0.4355, "rewards/accuracies": 0.875, "rewards/chosen": -2.4950547218322754, "rewards/margins": 1.049059271812439, "rewards/rejected": -3.544114351272583, "step": 2610 }, { "epoch": 0.17, "grad_norm": 10.5625, "learning_rate": 4.922775948387689e-06, "logits/chosen": -0.7621237635612488, "logits/rejected": -0.11592018604278564, "logps/chosen": -467.1397399902344, "logps/rejected": -552.8258056640625, "loss": 0.4303, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.2573916912078857, "rewards/margins": 1.5005296468734741, "rewards/rejected": -3.7579212188720703, "step": 2620 }, { "epoch": 0.17, "grad_norm": 23.75, "learning_rate": 4.921361306800486e-06, "logits/chosen": -0.9282606244087219, "logits/rejected": -0.38205426931381226, "logps/chosen": -550.5037231445312, "logps/rejected": -632.0145263671875, "loss": 0.5842, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.8819689750671387, "rewards/margins": 1.0414502620697021, "rewards/rejected": -3.923419952392578, "step": 2630 }, { "epoch": 0.17, "grad_norm": 35.5, "learning_rate": 4.919934032369821e-06, "logits/chosen": -1.0721185207366943, "logits/rejected": -0.5342406034469604, "logps/chosen": -498.97332763671875, "logps/rejected": -669.56689453125, "loss": 0.5485, "rewards/accuracies": 0.75, "rewards/chosen": -2.7788033485412598, "rewards/margins": 1.5302320718765259, "rewards/rejected": -4.309035301208496, "step": 2640 }, { "epoch": 0.17, "grad_norm": 7.15625, "learning_rate": 4.91849413254214e-06, "logits/chosen": -1.1485934257507324, "logits/rejected": -1.2234007120132446, "logps/chosen": -531.4505615234375, "logps/rejected": -676.8238525390625, "loss": 0.4828, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.506082057952881, "rewards/margins": 1.1689486503601074, "rewards/rejected": -3.6750309467315674, "step": 2650 }, { "epoch": 0.17, "grad_norm": 28.25, "learning_rate": 4.917041614829755e-06, "logits/chosen": -0.7964043021202087, "logits/rejected": -0.5896933674812317, "logps/chosen": -488.67083740234375, "logps/rejected": -565.3200073242188, "loss": 0.5343, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.6585452556610107, "rewards/margins": 0.8475700616836548, "rewards/rejected": -3.506115436553955, "step": 2660 }, { "epoch": 0.17, "grad_norm": 28.75, "learning_rate": 4.915576486810816e-06, "logits/chosen": -1.0134162902832031, "logits/rejected": -0.6163747310638428, "logps/chosen": -563.9337768554688, "logps/rejected": -618.1134643554688, "loss": 0.6083, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.9627692699432373, "rewards/margins": 0.9765104055404663, "rewards/rejected": -3.9392800331115723, "step": 2670 }, { "epoch": 0.18, "grad_norm": 20.25, "learning_rate": 4.914098756129256e-06, "logits/chosen": -1.1813762187957764, "logits/rejected": -0.8586546778678894, "logps/chosen": -521.541748046875, "logps/rejected": -616.39892578125, "loss": 0.6021, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.619070529937744, "rewards/margins": 0.8620538711547852, "rewards/rejected": -3.48112416267395, "step": 2680 }, { "epoch": 0.18, "grad_norm": 12.3125, "learning_rate": 4.912608430494765e-06, "logits/chosen": -0.9947369694709778, "logits/rejected": -0.9067084193229675, "logps/chosen": -554.2762451171875, "logps/rejected": -607.7996215820312, "loss": 0.6935, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.714287281036377, "rewards/margins": 0.6404015421867371, "rewards/rejected": -3.354689121246338, "step": 2690 }, { "epoch": 0.18, "grad_norm": 32.25, "learning_rate": 4.9111055176827415e-06, "logits/chosen": -0.9540726542472839, "logits/rejected": -0.9945276975631714, "logps/chosen": -499.77166748046875, "logps/rejected": -582.8146362304688, "loss": 0.7293, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.7602155208587646, "rewards/margins": 0.4538514018058777, "rewards/rejected": -3.214066982269287, "step": 2700 }, { "epoch": 0.18, "eval_logits/chosen": -1.2653329372406006, "eval_logits/rejected": -1.0068553686141968, "eval_logps/chosen": -496.885498046875, "eval_logps/rejected": -544.9945678710938, "eval_loss": 0.5603376626968384, "eval_rewards/accuracies": 0.7024999856948853, "eval_rewards/chosen": -2.3226590156555176, "eval_rewards/margins": 0.6815701723098755, "eval_rewards/rejected": -3.0042290687561035, "eval_runtime": 1081.8002, "eval_samples_per_second": 1.849, "eval_steps_per_second": 1.849, "step": 2700 }, { "epoch": 0.18, "grad_norm": 14.125, "learning_rate": 4.909590025534255e-06, "logits/chosen": -1.4002914428710938, "logits/rejected": -1.0992753505706787, "logps/chosen": -468.275634765625, "logps/rejected": -539.5154418945312, "loss": 0.4896, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.1641759872436523, "rewards/margins": 0.7119359374046326, "rewards/rejected": -2.8761115074157715, "step": 2710 }, { "epoch": 0.18, "grad_norm": 10.9375, "learning_rate": 4.908061961956006e-06, "logits/chosen": -1.1610498428344727, "logits/rejected": -1.0747697353363037, "logps/chosen": -387.88970947265625, "logps/rejected": -474.09002685546875, "loss": 0.5676, "rewards/accuracies": 0.75, "rewards/chosen": -1.961199164390564, "rewards/margins": 0.7550275921821594, "rewards/rejected": -2.716226816177368, "step": 2720 }, { "epoch": 0.18, "grad_norm": 29.0, "learning_rate": 4.90652133492028e-06, "logits/chosen": -1.4776216745376587, "logits/rejected": -0.9607813954353333, "logps/chosen": -556.5850830078125, "logps/rejected": -642.3138427734375, "loss": 0.535, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.2682719230651855, "rewards/margins": 0.9128721952438354, "rewards/rejected": -3.1811439990997314, "step": 2730 }, { "epoch": 0.18, "grad_norm": 14.125, "learning_rate": 4.904968152464911e-06, "logits/chosen": -0.9095733761787415, "logits/rejected": -0.34039565920829773, "logps/chosen": -448.2177734375, "logps/rejected": -515.9580078125, "loss": 0.4717, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.124785900115967, "rewards/margins": 1.1040289402008057, "rewards/rejected": -3.2288146018981934, "step": 2740 }, { "epoch": 0.18, "grad_norm": 7.1875, "learning_rate": 4.903402422693239e-06, "logits/chosen": -1.164085030555725, "logits/rejected": -0.5443894267082214, "logps/chosen": -518.2283325195312, "logps/rejected": -623.6531982421875, "loss": 0.4888, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.341888189315796, "rewards/margins": 1.3275411128997803, "rewards/rejected": -3.6694297790527344, "step": 2750 }, { "epoch": 0.18, "grad_norm": 21.75, "learning_rate": 4.901824153774064e-06, "logits/chosen": -0.44409093260765076, "logits/rejected": -0.160577654838562, "logps/chosen": -523.2752075195312, "logps/rejected": -656.0370483398438, "loss": 0.4891, "rewards/accuracies": 0.75, "rewards/chosen": -2.581050395965576, "rewards/margins": 1.341514229774475, "rewards/rejected": -3.922564744949341, "step": 2760 }, { "epoch": 0.18, "grad_norm": 40.0, "learning_rate": 4.900233353941608e-06, "logits/chosen": -1.100475549697876, "logits/rejected": -0.6773032546043396, "logps/chosen": -489.81170654296875, "logps/rejected": -620.32421875, "loss": 0.4308, "rewards/accuracies": 0.75, "rewards/chosen": -2.5852653980255127, "rewards/margins": 1.369653344154358, "rewards/rejected": -3.954918622970581, "step": 2770 }, { "epoch": 0.18, "grad_norm": 24.5, "learning_rate": 4.898630031495467e-06, "logits/chosen": -1.0070301294326782, "logits/rejected": -0.759345531463623, "logps/chosen": -538.4212646484375, "logps/rejected": -589.982421875, "loss": 0.5798, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.6859207153320312, "rewards/margins": 0.8809080123901367, "rewards/rejected": -3.566828966140747, "step": 2780 }, { "epoch": 0.18, "grad_norm": 16.75, "learning_rate": 4.897014194800575e-06, "logits/chosen": -0.9060072898864746, "logits/rejected": -0.18769614398479462, "logps/chosen": -543.5404052734375, "logps/rejected": -566.7427978515625, "loss": 0.6263, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.83385968208313, "rewards/margins": 0.7570546865463257, "rewards/rejected": -3.590914249420166, "step": 2790 }, { "epoch": 0.18, "grad_norm": 9.5625, "learning_rate": 4.895385852287152e-06, "logits/chosen": -1.0265872478485107, "logits/rejected": -1.017946481704712, "logps/chosen": -495.6697692871094, "logps/rejected": -626.3646240234375, "loss": 0.4734, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.4274399280548096, "rewards/margins": 1.2342894077301025, "rewards/rejected": -3.661729335784912, "step": 2800 }, { "epoch": 0.18, "eval_logits/chosen": -0.9492429494857788, "eval_logits/rejected": -0.6144876480102539, "eval_logps/chosen": -518.4933471679688, "eval_logps/rejected": -604.3603515625, "eval_loss": 0.5765022039413452, "eval_rewards/accuracies": 0.7099999785423279, "eval_rewards/chosen": -2.5387380123138428, "eval_rewards/margins": 1.059149146080017, "eval_rewards/rejected": -3.5978872776031494, "eval_runtime": 1082.1716, "eval_samples_per_second": 1.848, "eval_steps_per_second": 1.848, "step": 2800 }, { "epoch": 0.18, "grad_norm": 37.75, "learning_rate": 4.893745012450666e-06, "logits/chosen": -0.7495434880256653, "logits/rejected": -0.38999444246292114, "logps/chosen": -563.0011596679688, "logps/rejected": -596.3585815429688, "loss": 0.6817, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -3.0786445140838623, "rewards/margins": 0.6129933595657349, "rewards/rejected": -3.691638231277466, "step": 2810 }, { "epoch": 0.18, "grad_norm": 11.625, "learning_rate": 4.892091683851785e-06, "logits/chosen": -0.9500097036361694, "logits/rejected": -0.6063931584358215, "logps/chosen": -550.6644287109375, "logps/rejected": -672.7066040039062, "loss": 0.6187, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.86037015914917, "rewards/margins": 0.9597105979919434, "rewards/rejected": -3.8200809955596924, "step": 2820 }, { "epoch": 0.19, "grad_norm": 17.75, "learning_rate": 4.890425875116337e-06, "logits/chosen": -1.3380236625671387, "logits/rejected": -0.9313468933105469, "logps/chosen": -500.8341369628906, "logps/rejected": -565.7659912109375, "loss": 0.6298, "rewards/accuracies": 0.625, "rewards/chosen": -2.383645534515381, "rewards/margins": 0.9058547019958496, "rewards/rejected": -3.2895004749298096, "step": 2830 }, { "epoch": 0.19, "grad_norm": 19.25, "learning_rate": 4.888747594935259e-06, "logits/chosen": -1.3680202960968018, "logits/rejected": -1.152977705001831, "logps/chosen": -409.97003173828125, "logps/rejected": -485.0283203125, "loss": 0.5589, "rewards/accuracies": 0.625, "rewards/chosen": -1.8007583618164062, "rewards/margins": 0.8031457662582397, "rewards/rejected": -2.6039042472839355, "step": 2840 }, { "epoch": 0.19, "grad_norm": 10.875, "learning_rate": 4.887056852064555e-06, "logits/chosen": -1.27714204788208, "logits/rejected": -1.1043390035629272, "logps/chosen": -353.5475158691406, "logps/rejected": -440.7120056152344, "loss": 0.4436, "rewards/accuracies": 0.75, "rewards/chosen": -1.307898759841919, "rewards/margins": 0.922389030456543, "rewards/rejected": -2.230287790298462, "step": 2850 }, { "epoch": 0.19, "grad_norm": 7.8125, "learning_rate": 4.8853536553252505e-06, "logits/chosen": -1.3724300861358643, "logits/rejected": -1.1606223583221436, "logps/chosen": -483.1075134277344, "logps/rejected": -508.71051025390625, "loss": 0.5251, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.8511075973510742, "rewards/margins": 0.756685733795166, "rewards/rejected": -2.6077935695648193, "step": 2860 }, { "epoch": 0.19, "grad_norm": 24.25, "learning_rate": 4.8836380136033465e-06, "logits/chosen": -1.5302479267120361, "logits/rejected": -1.2814452648162842, "logps/chosen": -486.3905334472656, "logps/rejected": -574.9298095703125, "loss": 0.5883, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.986690878868103, "rewards/margins": 0.845594048500061, "rewards/rejected": -2.832284927368164, "step": 2870 }, { "epoch": 0.19, "grad_norm": 18.375, "learning_rate": 4.881909935849772e-06, "logits/chosen": -1.3302277326583862, "logits/rejected": -1.043800711631775, "logps/chosen": -461.4295959472656, "logps/rejected": -524.9710693359375, "loss": 0.4936, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.1624107360839844, "rewards/margins": 1.1703975200653076, "rewards/rejected": -3.332808256149292, "step": 2880 }, { "epoch": 0.19, "grad_norm": 12.0625, "learning_rate": 4.8801694310803394e-06, "logits/chosen": -0.7241562604904175, "logits/rejected": -0.5024352073669434, "logps/chosen": -463.110107421875, "logps/rejected": -595.5580444335938, "loss": 0.606, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.6411948204040527, "rewards/margins": 1.2462457418441772, "rewards/rejected": -3.8874409198760986, "step": 2890 }, { "epoch": 0.19, "grad_norm": 4.5, "learning_rate": 4.878416508375692e-06, "logits/chosen": -0.9864310026168823, "logits/rejected": -1.1451997756958008, "logps/chosen": -482.9327087402344, "logps/rejected": -638.0906982421875, "loss": 0.5551, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.7507164478302, "rewards/margins": 1.2867149114608765, "rewards/rejected": -4.037430763244629, "step": 2900 }, { "epoch": 0.19, "eval_logits/chosen": -1.0007816553115845, "eval_logits/rejected": -0.6866695880889893, "eval_logps/chosen": -562.2118530273438, "eval_logps/rejected": -658.637451171875, "eval_loss": 0.5749436616897583, "eval_rewards/accuracies": 0.7105000019073486, "eval_rewards/chosen": -2.9759230613708496, "eval_rewards/margins": 1.164734959602356, "eval_rewards/rejected": -4.140657901763916, "eval_runtime": 1082.6655, "eval_samples_per_second": 1.847, "eval_steps_per_second": 1.847, "step": 2900 }, { "epoch": 0.19, "grad_norm": 14.9375, "learning_rate": 4.876651176881264e-06, "logits/chosen": -0.9226648211479187, "logits/rejected": -0.5771136283874512, "logps/chosen": -526.1315307617188, "logps/rejected": -572.4791259765625, "loss": 0.6871, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -3.136918783187866, "rewards/margins": 0.6927804350852966, "rewards/rejected": -3.8296992778778076, "step": 2910 }, { "epoch": 0.19, "grad_norm": 9.625, "learning_rate": 4.874873445807229e-06, "logits/chosen": -1.1301019191741943, "logits/rejected": -0.5384422540664673, "logps/chosen": -531.8717651367188, "logps/rejected": -729.2259521484375, "loss": 0.4178, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.8012874126434326, "rewards/margins": 1.6119840145111084, "rewards/rejected": -4.413271903991699, "step": 2920 }, { "epoch": 0.19, "grad_norm": 108.5, "learning_rate": 4.8730833244284505e-06, "logits/chosen": -0.489138126373291, "logits/rejected": 0.09619824588298798, "logps/chosen": -635.22412109375, "logps/rejected": -696.9118041992188, "loss": 0.6432, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.7008774280548096, "rewards/margins": 0.9634177088737488, "rewards/rejected": -4.664295196533203, "step": 2930 }, { "epoch": 0.19, "grad_norm": 11.5625, "learning_rate": 4.871280822084438e-06, "logits/chosen": -1.0972245931625366, "logits/rejected": -0.5302432179450989, "logps/chosen": -651.4733276367188, "logps/rejected": -704.0028076171875, "loss": 0.6773, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.5298924446105957, "rewards/margins": 0.9965899586677551, "rewards/rejected": -4.526482582092285, "step": 2940 }, { "epoch": 0.19, "grad_norm": 61.25, "learning_rate": 4.869465948179293e-06, "logits/chosen": -1.012540578842163, "logits/rejected": -0.8276836276054382, "logps/chosen": -532.4972534179688, "logps/rejected": -662.3926391601562, "loss": 0.5076, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.967895030975342, "rewards/margins": 1.4679548740386963, "rewards/rejected": -4.435849666595459, "step": 2950 }, { "epoch": 0.19, "grad_norm": 25.0, "learning_rate": 4.867638712181663e-06, "logits/chosen": -1.1108969449996948, "logits/rejected": -0.6297051310539246, "logps/chosen": -480.4386291503906, "logps/rejected": -600.0165405273438, "loss": 0.6666, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -3.069643497467041, "rewards/margins": 0.808964729309082, "rewards/rejected": -3.878607988357544, "step": 2960 }, { "epoch": 0.19, "grad_norm": 23.75, "learning_rate": 4.865799123624692e-06, "logits/chosen": -1.3721752166748047, "logits/rejected": -0.7209434509277344, "logps/chosen": -531.6541748046875, "logps/rejected": -584.013671875, "loss": 0.5447, "rewards/accuracies": 0.75, "rewards/chosen": -2.792494058609009, "rewards/margins": 1.0497827529907227, "rewards/rejected": -3.8422768115997314, "step": 2970 }, { "epoch": 0.19, "grad_norm": 13.375, "learning_rate": 4.863947192105973e-06, "logits/chosen": -1.5659619569778442, "logits/rejected": -0.9766691327095032, "logps/chosen": -554.1716918945312, "logps/rejected": -654.83837890625, "loss": 0.5688, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.956875801086426, "rewards/margins": 1.1680063009262085, "rewards/rejected": -4.124882221221924, "step": 2980 }, { "epoch": 0.2, "grad_norm": 10.4375, "learning_rate": 4.862082927287491e-06, "logits/chosen": -1.4950758218765259, "logits/rejected": -0.986733078956604, "logps/chosen": -467.541015625, "logps/rejected": -582.5408935546875, "loss": 0.4558, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.1998915672302246, "rewards/margins": 1.179106593132019, "rewards/rejected": -3.378998279571533, "step": 2990 }, { "epoch": 0.2, "grad_norm": 8.3125, "learning_rate": 4.860206338895578e-06, "logits/chosen": -1.4583523273468018, "logits/rejected": -1.2290408611297607, "logps/chosen": -525.3323974609375, "logps/rejected": -592.743896484375, "loss": 0.7045, "rewards/accuracies": 0.75, "rewards/chosen": -2.884678363800049, "rewards/margins": 0.6703182458877563, "rewards/rejected": -3.554996967315674, "step": 3000 }, { "epoch": 0.2, "eval_logits/chosen": -1.4700161218643188, "eval_logits/rejected": -1.134680151939392, "eval_logps/chosen": -542.4956665039062, "eval_logps/rejected": -631.8784790039062, "eval_loss": 0.5745168328285217, "eval_rewards/accuracies": 0.7210000157356262, "eval_rewards/chosen": -2.7787601947784424, "eval_rewards/margins": 1.09430730342865, "eval_rewards/rejected": -3.873067855834961, "eval_runtime": 1082.1479, "eval_samples_per_second": 1.848, "eval_steps_per_second": 1.848, "step": 3000 }, { "epoch": 0.2, "grad_norm": 7.9375, "learning_rate": 4.858317436720862e-06, "logits/chosen": -1.5180243253707886, "logits/rejected": -1.1232070922851562, "logps/chosen": -601.0726318359375, "logps/rejected": -682.299072265625, "loss": 0.6226, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.7825140953063965, "rewards/margins": 1.0207786560058594, "rewards/rejected": -3.803292751312256, "step": 3010 }, { "epoch": 0.2, "grad_norm": 14.3125, "learning_rate": 4.856416230618216e-06, "logits/chosen": -1.4747172594070435, "logits/rejected": -1.2326972484588623, "logps/chosen": -575.8563232421875, "logps/rejected": -626.7279663085938, "loss": 0.4909, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.6182587146759033, "rewards/margins": 1.1161049604415894, "rewards/rejected": -3.7343640327453613, "step": 3020 }, { "epoch": 0.2, "grad_norm": 5.78125, "learning_rate": 4.854502730506704e-06, "logits/chosen": -1.3093783855438232, "logits/rejected": -0.7401999235153198, "logps/chosen": -483.31634521484375, "logps/rejected": -532.5453491210938, "loss": 0.5335, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.4433138370513916, "rewards/margins": 0.8713593482971191, "rewards/rejected": -3.3146729469299316, "step": 3030 }, { "epoch": 0.2, "grad_norm": 28.125, "learning_rate": 4.852576946369532e-06, "logits/chosen": -1.1920539140701294, "logits/rejected": -1.032806634902954, "logps/chosen": -592.3204345703125, "logps/rejected": -672.8058471679688, "loss": 0.4701, "rewards/accuracies": 0.75, "rewards/chosen": -2.7370924949645996, "rewards/margins": 1.1364582777023315, "rewards/rejected": -3.8735508918762207, "step": 3040 }, { "epoch": 0.2, "grad_norm": 81.5, "learning_rate": 4.850638888253992e-06, "logits/chosen": -0.959607720375061, "logits/rejected": -1.0059874057769775, "logps/chosen": -589.0208129882812, "logps/rejected": -707.7827758789062, "loss": 0.5377, "rewards/accuracies": 0.75, "rewards/chosen": -3.255613327026367, "rewards/margins": 1.404503583908081, "rewards/rejected": -4.660116672515869, "step": 3050 }, { "epoch": 0.2, "grad_norm": 11.6875, "learning_rate": 4.848688566271418e-06, "logits/chosen": -1.0182592868804932, "logits/rejected": -0.6390029788017273, "logps/chosen": -550.360107421875, "logps/rejected": -630.9844970703125, "loss": 0.4956, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.302762985229492, "rewards/margins": 1.0322041511535645, "rewards/rejected": -4.334967613220215, "step": 3060 }, { "epoch": 0.2, "grad_norm": 8.4375, "learning_rate": 4.846725990597122e-06, "logits/chosen": -1.1571100950241089, "logits/rejected": -0.6310211420059204, "logps/chosen": -614.6341552734375, "logps/rejected": -618.8116455078125, "loss": 0.8476, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.6892783641815186, "rewards/margins": 0.4788757860660553, "rewards/rejected": -4.168154239654541, "step": 3070 }, { "epoch": 0.2, "grad_norm": 27.375, "learning_rate": 4.8447511714703495e-06, "logits/chosen": -1.1612187623977661, "logits/rejected": -0.9569363594055176, "logps/chosen": -591.1726684570312, "logps/rejected": -615.412353515625, "loss": 0.7817, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.4095795154571533, "rewards/margins": 0.7850635647773743, "rewards/rejected": -4.194643020629883, "step": 3080 }, { "epoch": 0.2, "grad_norm": 25.75, "learning_rate": 4.842764119194222e-06, "logits/chosen": -1.6727104187011719, "logits/rejected": -1.4156793355941772, "logps/chosen": -681.1204833984375, "logps/rejected": -759.1964111328125, "loss": 0.4908, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.262073040008545, "rewards/margins": 1.2252382040023804, "rewards/rejected": -4.487311363220215, "step": 3090 }, { "epoch": 0.2, "grad_norm": 30.625, "learning_rate": 4.840764844135686e-06, "logits/chosen": -1.1525264978408813, "logits/rejected": -0.8894956707954407, "logps/chosen": -613.6727905273438, "logps/rejected": -628.007568359375, "loss": 0.732, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.5448100566864014, "rewards/margins": 0.5728045105934143, "rewards/rejected": -4.11761474609375, "step": 3100 }, { "epoch": 0.2, "eval_logits/chosen": -1.204930067062378, "eval_logits/rejected": -0.8125373125076294, "eval_logps/chosen": -638.6746215820312, "eval_logps/rejected": -727.5560302734375, "eval_loss": 0.5703139305114746, "eval_rewards/accuracies": 0.7149999737739563, "eval_rewards/chosen": -3.7405505180358887, "eval_rewards/margins": 1.0892930030822754, "eval_rewards/rejected": -4.829843521118164, "eval_runtime": 1082.2611, "eval_samples_per_second": 1.848, "eval_steps_per_second": 1.848, "step": 3100 }, { "epoch": 0.2, "grad_norm": 102.5, "learning_rate": 4.838753356725455e-06, "logits/chosen": -1.2993721961975098, "logits/rejected": -0.9698923230171204, "logps/chosen": -613.9100341796875, "logps/rejected": -721.1390380859375, "loss": 0.5943, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.647103786468506, "rewards/margins": 1.279939889907837, "rewards/rejected": -4.92704439163208, "step": 3110 }, { "epoch": 0.2, "grad_norm": 11.6875, "learning_rate": 4.83672966745796e-06, "logits/chosen": -0.5505729913711548, "logits/rejected": -0.708713948726654, "logps/chosen": -721.4688720703125, "logps/rejected": -827.0159912109375, "loss": 0.5779, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.423375129699707, "rewards/margins": 0.9695072174072266, "rewards/rejected": -5.392881870269775, "step": 3120 }, { "epoch": 0.2, "grad_norm": 11.125, "learning_rate": 4.8346937868912885e-06, "logits/chosen": -0.6967655420303345, "logits/rejected": -0.5530049204826355, "logps/chosen": -762.2486572265625, "logps/rejected": -815.1415405273438, "loss": 0.5236, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -5.236546516418457, "rewards/margins": 0.922626793384552, "rewards/rejected": -6.159173011779785, "step": 3130 }, { "epoch": 0.21, "grad_norm": 22.75, "learning_rate": 4.832645725647137e-06, "logits/chosen": -0.35282713174819946, "logits/rejected": -0.4759860634803772, "logps/chosen": -860.7420043945312, "logps/rejected": -945.7960815429688, "loss": 0.7344, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -5.518100738525391, "rewards/margins": 0.5187007784843445, "rewards/rejected": -6.036801338195801, "step": 3140 }, { "epoch": 0.21, "grad_norm": 67.5, "learning_rate": 4.830585494410749e-06, "logits/chosen": -0.844907283782959, "logits/rejected": -0.20988717675209045, "logps/chosen": -759.290771484375, "logps/rejected": -799.6171264648438, "loss": 0.5006, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -5.059080600738525, "rewards/margins": 1.0010488033294678, "rewards/rejected": -6.060129642486572, "step": 3150 }, { "epoch": 0.21, "grad_norm": 13.3125, "learning_rate": 4.828513103930862e-06, "logits/chosen": -0.9502388834953308, "logits/rejected": -0.3857150971889496, "logps/chosen": -694.3543701171875, "logps/rejected": -820.5437622070312, "loss": 0.5072, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.0775837898254395, "rewards/margins": 1.699538230895996, "rewards/rejected": -5.777121543884277, "step": 3160 }, { "epoch": 0.21, "grad_norm": 19.625, "learning_rate": 4.826428565019653e-06, "logits/chosen": -1.1060038805007935, "logits/rejected": -0.5731886029243469, "logps/chosen": -537.85888671875, "logps/rejected": -696.0828857421875, "loss": 0.4296, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.039494037628174, "rewards/margins": 1.5998737812042236, "rewards/rejected": -4.63936710357666, "step": 3170 }, { "epoch": 0.21, "grad_norm": 11.75, "learning_rate": 4.82433188855268e-06, "logits/chosen": -1.3443410396575928, "logits/rejected": -0.843428909778595, "logps/chosen": -643.633056640625, "logps/rejected": -738.4778442382812, "loss": 0.4441, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.394624710083008, "rewards/margins": 1.1966426372528076, "rewards/rejected": -4.5912675857543945, "step": 3180 }, { "epoch": 0.21, "grad_norm": 153.0, "learning_rate": 4.822223085468823e-06, "logits/chosen": -1.193100094795227, "logits/rejected": -0.7550846934318542, "logps/chosen": -647.3896484375, "logps/rejected": -703.3218383789062, "loss": 0.6901, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.106610298156738, "rewards/margins": 0.9782665371894836, "rewards/rejected": -5.084876537322998, "step": 3190 }, { "epoch": 0.21, "grad_norm": 7.375, "learning_rate": 4.820102166770233e-06, "logits/chosen": -1.4512341022491455, "logits/rejected": -1.4257360696792603, "logps/chosen": -481.8994140625, "logps/rejected": -597.9351806640625, "loss": 0.585, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.475877046585083, "rewards/margins": 0.7811436057090759, "rewards/rejected": -3.2570204734802246, "step": 3200 }, { "epoch": 0.21, "eval_logits/chosen": -1.6495109796524048, "eval_logits/rejected": -1.389194130897522, "eval_logps/chosen": -504.2574768066406, "eval_logps/rejected": -566.1844482421875, "eval_loss": 0.5682303309440613, "eval_rewards/accuracies": 0.7049999833106995, "eval_rewards/chosen": -2.396378517150879, "eval_rewards/margins": 0.8197490572929382, "eval_rewards/rejected": -3.216127395629883, "eval_runtime": 1082.1448, "eval_samples_per_second": 1.848, "eval_steps_per_second": 1.848, "step": 3200 }, { "epoch": 0.21, "grad_norm": 9.5, "learning_rate": 4.817969143522269e-06, "logits/chosen": -1.9869191646575928, "logits/rejected": -1.5637187957763672, "logps/chosen": -477.1033630371094, "logps/rejected": -604.8731079101562, "loss": 0.4612, "rewards/accuracies": 0.875, "rewards/chosen": -2.103269100189209, "rewards/margins": 1.138044834136963, "rewards/rejected": -3.241313934326172, "step": 3210 }, { "epoch": 0.21, "grad_norm": 10.3125, "learning_rate": 4.815824026853444e-06, "logits/chosen": -1.4754489660263062, "logits/rejected": -1.4228181838989258, "logps/chosen": -498.0628356933594, "logps/rejected": -656.4076538085938, "loss": 0.6202, "rewards/accuracies": 0.75, "rewards/chosen": -2.7221312522888184, "rewards/margins": 1.1207640171051025, "rewards/rejected": -3.8428955078125, "step": 3220 }, { "epoch": 0.21, "grad_norm": 48.75, "learning_rate": 4.8136668279553645e-06, "logits/chosen": -1.2078090906143188, "logits/rejected": -0.79558265209198, "logps/chosen": -533.246337890625, "logps/rejected": -645.1798095703125, "loss": 0.612, "rewards/accuracies": 0.625, "rewards/chosen": -3.2462334632873535, "rewards/margins": 0.954441249370575, "rewards/rejected": -4.200675010681152, "step": 3230 }, { "epoch": 0.21, "grad_norm": 4.09375, "learning_rate": 4.811497558082673e-06, "logits/chosen": -1.2398583889007568, "logits/rejected": -1.1749696731567383, "logps/chosen": -479.38812255859375, "logps/rejected": -569.0161743164062, "loss": 0.6198, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.987818479537964, "rewards/margins": 0.9334991574287415, "rewards/rejected": -3.9213175773620605, "step": 3240 }, { "epoch": 0.21, "grad_norm": 58.75, "learning_rate": 4.809316228552988e-06, "logits/chosen": -1.163702130317688, "logits/rejected": -0.6508394479751587, "logps/chosen": -526.8558349609375, "logps/rejected": -650.5054931640625, "loss": 0.6314, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.5115795135498047, "rewards/margins": 0.9742542505264282, "rewards/rejected": -4.485833168029785, "step": 3250 }, { "epoch": 0.21, "grad_norm": 12.75, "learning_rate": 4.807122850746849e-06, "logits/chosen": -1.2836594581604004, "logits/rejected": -0.5419913530349731, "logps/chosen": -516.7344970703125, "logps/rejected": -644.685546875, "loss": 0.4208, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.77213716506958, "rewards/margins": 1.3305270671844482, "rewards/rejected": -4.102664470672607, "step": 3260 }, { "epoch": 0.21, "grad_norm": 15.0, "learning_rate": 4.8049174361076525e-06, "logits/chosen": -1.299584984779358, "logits/rejected": -1.1614048480987549, "logps/chosen": -476.8759765625, "logps/rejected": -575.92041015625, "loss": 0.5515, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.6399428844451904, "rewards/margins": 0.769615888595581, "rewards/rejected": -3.4095587730407715, "step": 3270 }, { "epoch": 0.21, "grad_norm": 31.0, "learning_rate": 4.802699996141594e-06, "logits/chosen": -1.2031478881835938, "logits/rejected": -1.4828083515167236, "logps/chosen": -413.83245849609375, "logps/rejected": -574.8737182617188, "loss": 0.5575, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.313629627227783, "rewards/margins": 0.97466641664505, "rewards/rejected": -3.2882957458496094, "step": 3280 }, { "epoch": 0.22, "grad_norm": 15.375, "learning_rate": 4.800470542417609e-06, "logits/chosen": -1.6172211170196533, "logits/rejected": -1.4242514371871948, "logps/chosen": -514.2794799804688, "logps/rejected": -578.9103393554688, "loss": 0.5758, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.3101601600646973, "rewards/margins": 0.6258403062820435, "rewards/rejected": -2.936000347137451, "step": 3290 }, { "epoch": 0.22, "grad_norm": 22.25, "learning_rate": 4.798229086567312e-06, "logits/chosen": -1.3142378330230713, "logits/rejected": -1.1479809284210205, "logps/chosen": -530.034423828125, "logps/rejected": -602.07666015625, "loss": 0.5844, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.8024251461029053, "rewards/margins": 0.7338841557502747, "rewards/rejected": -3.5363094806671143, "step": 3300 }, { "epoch": 0.22, "eval_logits/chosen": -1.180967926979065, "eval_logits/rejected": -0.83067387342453, "eval_logps/chosen": -561.1475830078125, "eval_logps/rejected": -653.2316284179688, "eval_loss": 0.5571843385696411, "eval_rewards/accuracies": 0.7139999866485596, "eval_rewards/chosen": -2.965280294418335, "eval_rewards/margins": 1.1213196516036987, "eval_rewards/rejected": -4.086599826812744, "eval_runtime": 1081.7347, "eval_samples_per_second": 1.849, "eval_steps_per_second": 1.849, "step": 3300 }, { "epoch": 0.22, "grad_norm": 25.5, "learning_rate": 4.795975640284935e-06, "logits/chosen": -1.1965676546096802, "logits/rejected": -0.9458176493644714, "logps/chosen": -590.3027954101562, "logps/rejected": -660.7537841796875, "loss": 0.6608, "rewards/accuracies": 0.75, "rewards/chosen": -3.3251266479492188, "rewards/margins": 0.7859554290771484, "rewards/rejected": -4.111081600189209, "step": 3310 }, { "epoch": 0.22, "grad_norm": 15.5625, "learning_rate": 4.793710215327266e-06, "logits/chosen": -0.8924918174743652, "logits/rejected": -0.3467273414134979, "logps/chosen": -688.7371215820312, "logps/rejected": -707.7255249023438, "loss": 0.6616, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.143535614013672, "rewards/margins": 0.90118807554245, "rewards/rejected": -5.044723987579346, "step": 3320 }, { "epoch": 0.22, "grad_norm": 30.5, "learning_rate": 4.791432823513591e-06, "logits/chosen": -0.5599774718284607, "logits/rejected": 0.03994857519865036, "logps/chosen": -594.4459228515625, "logps/rejected": -709.951416015625, "loss": 0.4493, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.630066394805908, "rewards/margins": 1.6870349645614624, "rewards/rejected": -5.31710147857666, "step": 3330 }, { "epoch": 0.22, "grad_norm": 7.3125, "learning_rate": 4.789143476725629e-06, "logits/chosen": -0.9170355796813965, "logits/rejected": -0.5607770681381226, "logps/chosen": -723.0828247070312, "logps/rejected": -827.0755615234375, "loss": 0.4634, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.065452575683594, "rewards/margins": 1.408247709274292, "rewards/rejected": -5.473700523376465, "step": 3340 }, { "epoch": 0.22, "grad_norm": 43.0, "learning_rate": 4.786842186907469e-06, "logits/chosen": -0.6802589297294617, "logits/rejected": 0.20150598883628845, "logps/chosen": -768.3973388671875, "logps/rejected": -880.7200317382812, "loss": 0.4276, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.715881824493408, "rewards/margins": 1.830611228942871, "rewards/rejected": -6.546492099761963, "step": 3350 }, { "epoch": 0.22, "grad_norm": 9.25, "learning_rate": 4.784528966065513e-06, "logits/chosen": -0.923387348651886, "logits/rejected": -0.1267358362674713, "logps/chosen": -682.7603759765625, "logps/rejected": -913.0802001953125, "loss": 0.3971, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.013949394226074, "rewards/margins": 2.194063663482666, "rewards/rejected": -6.20801305770874, "step": 3360 }, { "epoch": 0.22, "grad_norm": 11.1875, "learning_rate": 4.782203826268409e-06, "logits/chosen": -0.39992737770080566, "logits/rejected": 0.024773692712187767, "logps/chosen": -882.0910034179688, "logps/rejected": -950.6168212890625, "loss": 0.6733, "rewards/accuracies": 0.75, "rewards/chosen": -5.967153072357178, "rewards/margins": 1.087716817855835, "rewards/rejected": -7.054869651794434, "step": 3370 }, { "epoch": 0.22, "grad_norm": 22.875, "learning_rate": 4.779866779646988e-06, "logits/chosen": -0.6267349123954773, "logits/rejected": 0.09199440479278564, "logps/chosen": -833.3175659179688, "logps/rejected": -952.37109375, "loss": 0.6562, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -5.417940616607666, "rewards/margins": 1.4656034708023071, "rewards/rejected": -6.883543968200684, "step": 3380 }, { "epoch": 0.22, "grad_norm": 27.625, "learning_rate": 4.777517838394204e-06, "logits/chosen": -1.306886911392212, "logits/rejected": 0.2369219809770584, "logps/chosen": -742.55908203125, "logps/rejected": -883.3819580078125, "loss": 0.3887, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.719102382659912, "rewards/margins": 1.7817538976669312, "rewards/rejected": -6.500856876373291, "step": 3390 }, { "epoch": 0.22, "grad_norm": 15.75, "learning_rate": 4.775157014765067e-06, "logits/chosen": -1.3060866594314575, "logits/rejected": -0.8634389042854309, "logps/chosen": -657.3032836914062, "logps/rejected": -754.4668579101562, "loss": 0.4916, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.5014870166778564, "rewards/margins": 1.3346843719482422, "rewards/rejected": -4.8361711502075195, "step": 3400 }, { "epoch": 0.22, "eval_logits/chosen": -1.2400059700012207, "eval_logits/rejected": -0.9138941764831543, "eval_logps/chosen": -605.480224609375, "eval_logps/rejected": -689.3580322265625, "eval_loss": 0.5625504851341248, "eval_rewards/accuracies": 0.715499997138977, "eval_rewards/chosen": -3.4086062908172607, "eval_rewards/margins": 1.0392574071884155, "eval_rewards/rejected": -4.447863578796387, "eval_runtime": 1081.7381, "eval_samples_per_second": 1.849, "eval_steps_per_second": 1.849, "step": 3400 }, { "epoch": 0.22, "grad_norm": 19.375, "learning_rate": 4.772784321076579e-06, "logits/chosen": -1.1132270097732544, "logits/rejected": -0.9977341890335083, "logps/chosen": -536.7339477539062, "logps/rejected": -660.2379150390625, "loss": 0.5087, "rewards/accuracies": 0.75, "rewards/chosen": -2.9782986640930176, "rewards/margins": 1.156237006187439, "rewards/rejected": -4.134535789489746, "step": 3410 }, { "epoch": 0.22, "grad_norm": 10.125, "learning_rate": 4.7703997697076744e-06, "logits/chosen": -1.4152860641479492, "logits/rejected": -1.1574642658233643, "logps/chosen": -548.8434448242188, "logps/rejected": -639.4361572265625, "loss": 0.5728, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.7204294204711914, "rewards/margins": 0.8240100741386414, "rewards/rejected": -3.5444393157958984, "step": 3420 }, { "epoch": 0.22, "grad_norm": 13.75, "learning_rate": 4.768003373099148e-06, "logits/chosen": -1.4722901582717896, "logits/rejected": -0.9639554023742676, "logps/chosen": -537.4974365234375, "logps/rejected": -657.3250732421875, "loss": 0.3236, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.492647409439087, "rewards/margins": 1.8174049854278564, "rewards/rejected": -4.310051918029785, "step": 3430 }, { "epoch": 0.23, "grad_norm": 37.0, "learning_rate": 4.765595143753597e-06, "logits/chosen": -1.4009109735488892, "logits/rejected": -1.2545408010482788, "logps/chosen": -574.5474853515625, "logps/rejected": -637.9155883789062, "loss": 0.6424, "rewards/accuracies": 0.625, "rewards/chosen": -3.2242119312286377, "rewards/margins": 1.0137240886688232, "rewards/rejected": -4.237936019897461, "step": 3440 }, { "epoch": 0.23, "grad_norm": 29.75, "learning_rate": 4.763175094235352e-06, "logits/chosen": -1.0382020473480225, "logits/rejected": -0.847082793712616, "logps/chosen": -671.3468627929688, "logps/rejected": -752.9544677734375, "loss": 0.5999, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.89618182182312, "rewards/margins": 1.1922025680541992, "rewards/rejected": -5.088383674621582, "step": 3450 }, { "epoch": 0.23, "grad_norm": 59.25, "learning_rate": 4.760743237170415e-06, "logits/chosen": -1.1639267206192017, "logits/rejected": -0.8316108584403992, "logps/chosen": -810.783203125, "logps/rejected": -777.997802734375, "loss": 0.6755, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.708419322967529, "rewards/margins": 0.8711822628974915, "rewards/rejected": -5.579601287841797, "step": 3460 }, { "epoch": 0.23, "grad_norm": 14.5625, "learning_rate": 4.7582995852463844e-06, "logits/chosen": -0.9919028282165527, "logits/rejected": -0.43444591760635376, "logps/chosen": -771.0494384765625, "logps/rejected": -907.8330078125, "loss": 0.3365, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.744234561920166, "rewards/margins": 1.6410068273544312, "rewards/rejected": -6.385241508483887, "step": 3470 }, { "epoch": 0.23, "grad_norm": 14.0625, "learning_rate": 4.755844151212401e-06, "logits/chosen": -0.35371965169906616, "logits/rejected": -0.004077828023582697, "logps/chosen": -697.5130615234375, "logps/rejected": -819.2736206054688, "loss": 0.4937, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.6213698387146, "rewards/margins": 1.222683072090149, "rewards/rejected": -5.844052791595459, "step": 3480 }, { "epoch": 0.23, "grad_norm": 28.75, "learning_rate": 4.753376947879076e-06, "logits/chosen": -0.620795488357544, "logits/rejected": -0.3216823935508728, "logps/chosen": -728.139404296875, "logps/rejected": -871.5015869140625, "loss": 0.6007, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.507253170013428, "rewards/margins": 1.4663857221603394, "rewards/rejected": -5.973639011383057, "step": 3490 }, { "epoch": 0.23, "grad_norm": 25.375, "learning_rate": 4.750897988118419e-06, "logits/chosen": -0.43524184823036194, "logits/rejected": -0.9073172807693481, "logps/chosen": -736.4920654296875, "logps/rejected": -852.0842895507812, "loss": 0.5492, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.459339141845703, "rewards/margins": 1.2956950664520264, "rewards/rejected": -5.75503396987915, "step": 3500 }, { "epoch": 0.23, "eval_logits/chosen": -0.7194792628288269, "eval_logits/rejected": -0.2621947228908539, "eval_logps/chosen": -723.8026733398438, "eval_logps/rejected": -820.3834228515625, "eval_loss": 0.5705658793449402, "eval_rewards/accuracies": 0.7239999771118164, "eval_rewards/chosen": -4.591831207275391, "eval_rewards/margins": 1.166286587715149, "eval_rewards/rejected": -5.75811767578125, "eval_runtime": 1082.0156, "eval_samples_per_second": 1.848, "eval_steps_per_second": 1.848, "step": 3500 }, { "epoch": 0.23, "grad_norm": 62.0, "learning_rate": 4.74840728486378e-06, "logits/chosen": -0.8491600155830383, "logits/rejected": 0.10555891692638397, "logps/chosen": -688.68408203125, "logps/rejected": -697.4490966796875, "loss": 0.6864, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.497300148010254, "rewards/margins": 0.5967486500740051, "rewards/rejected": -5.094048500061035, "step": 3510 }, { "epoch": 0.23, "grad_norm": 41.5, "learning_rate": 4.745904851109779e-06, "logits/chosen": -0.8030447959899902, "logits/rejected": -0.31818851828575134, "logps/chosen": -589.0924682617188, "logps/rejected": -777.6842651367188, "loss": 0.5016, "rewards/accuracies": 0.75, "rewards/chosen": -3.6406517028808594, "rewards/margins": 1.496906042098999, "rewards/rejected": -5.137557506561279, "step": 3520 }, { "epoch": 0.23, "grad_norm": 27.5, "learning_rate": 4.743390699912232e-06, "logits/chosen": -0.6764593124389648, "logits/rejected": -0.6873399019241333, "logps/chosen": -714.80517578125, "logps/rejected": -782.1397705078125, "loss": 0.5709, "rewards/accuracies": 0.625, "rewards/chosen": -4.314342498779297, "rewards/margins": 1.039636254310608, "rewards/rejected": -5.353978633880615, "step": 3530 }, { "epoch": 0.23, "grad_norm": 5.1875, "learning_rate": 4.74086484438809e-06, "logits/chosen": -1.04461669921875, "logits/rejected": -1.3528707027435303, "logps/chosen": -614.5618896484375, "logps/rejected": -688.8928833007812, "loss": 0.5633, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.9445931911468506, "rewards/margins": 0.886997401714325, "rewards/rejected": -3.8315906524658203, "step": 3540 }, { "epoch": 0.23, "grad_norm": 12.875, "learning_rate": 4.73832729771537e-06, "logits/chosen": -1.6609852313995361, "logits/rejected": -1.428131103515625, "logps/chosen": -615.2433471679688, "logps/rejected": -653.7088623046875, "loss": 0.5765, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.828134059906006, "rewards/margins": 0.8854513168334961, "rewards/rejected": -3.713585376739502, "step": 3550 }, { "epoch": 0.23, "grad_norm": 14.75, "learning_rate": 4.735778073133084e-06, "logits/chosen": -1.4100815057754517, "logits/rejected": -1.004941701889038, "logps/chosen": -457.32598876953125, "logps/rejected": -522.54931640625, "loss": 0.7082, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.8064913749694824, "rewards/margins": 0.4415194094181061, "rewards/rejected": -3.2480106353759766, "step": 3560 }, { "epoch": 0.23, "grad_norm": 12.4375, "learning_rate": 4.733217183941169e-06, "logits/chosen": -1.4216357469558716, "logits/rejected": -1.3008767366409302, "logps/chosen": -603.7659912109375, "logps/rejected": -706.1392822265625, "loss": 0.4258, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.197375774383545, "rewards/margins": 1.1750842332839966, "rewards/rejected": -4.37246036529541, "step": 3570 }, { "epoch": 0.23, "grad_norm": 8.125, "learning_rate": 4.73064464350042e-06, "logits/chosen": -1.2225351333618164, "logits/rejected": -1.3869209289550781, "logps/chosen": -620.8990478515625, "logps/rejected": -759.6568603515625, "loss": 0.7508, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -3.794354200363159, "rewards/margins": 0.7074971199035645, "rewards/rejected": -4.5018510818481445, "step": 3580 }, { "epoch": 0.23, "grad_norm": 21.25, "learning_rate": 4.728060465232418e-06, "logits/chosen": -0.7605262398719788, "logits/rejected": -0.5269777178764343, "logps/chosen": -560.8189697265625, "logps/rejected": -716.3929443359375, "loss": 0.4487, "rewards/accuracies": 0.75, "rewards/chosen": -3.3244705200195312, "rewards/margins": 1.7012325525283813, "rewards/rejected": -5.025703430175781, "step": 3590 }, { "epoch": 0.24, "grad_norm": 6.25, "learning_rate": 4.725464662619466e-06, "logits/chosen": -0.7623016238212585, "logits/rejected": -0.5146565437316895, "logps/chosen": -660.55810546875, "logps/rejected": -848.2222900390625, "loss": 0.4557, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.126250743865967, "rewards/margins": 1.7908127307891846, "rewards/rejected": -5.917063236236572, "step": 3600 }, { "epoch": 0.24, "eval_logits/chosen": -0.7418410778045654, "eval_logits/rejected": -0.25624728202819824, "eval_logps/chosen": -766.2864990234375, "eval_logps/rejected": -873.8727416992188, "eval_loss": 0.5934799313545227, "eval_rewards/accuracies": 0.7045000195503235, "eval_rewards/chosen": -5.016669273376465, "eval_rewards/margins": 1.276341199874878, "eval_rewards/rejected": -6.293010711669922, "eval_runtime": 1081.9075, "eval_samples_per_second": 1.849, "eval_steps_per_second": 1.849, "step": 3600 }, { "epoch": 0.24, "grad_norm": 26.625, "learning_rate": 4.72285724920451e-06, "logits/chosen": -0.49695903062820435, "logits/rejected": 0.15499183535575867, "logps/chosen": -818.9177856445312, "logps/rejected": -878.5466918945312, "loss": 0.6203, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -5.513499736785889, "rewards/margins": 1.0668903589248657, "rewards/rejected": -6.580389976501465, "step": 3610 }, { "epoch": 0.24, "grad_norm": 15.25, "learning_rate": 4.7202382385910724e-06, "logits/chosen": -0.9239240884780884, "logits/rejected": -0.5548107028007507, "logps/chosen": -815.7239379882812, "logps/rejected": -889.0584106445312, "loss": 0.7026, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -5.303755760192871, "rewards/margins": 0.9647020101547241, "rewards/rejected": -6.268457889556885, "step": 3620 }, { "epoch": 0.24, "grad_norm": 12.8125, "learning_rate": 4.717607644443184e-06, "logits/chosen": -0.30479803681373596, "logits/rejected": 0.17707741260528564, "logps/chosen": -760.4248046875, "logps/rejected": -908.0985107421875, "loss": 0.4673, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -5.053077220916748, "rewards/margins": 1.4415043592453003, "rewards/rejected": -6.494582176208496, "step": 3630 }, { "epoch": 0.24, "grad_norm": 23.5, "learning_rate": 4.714965480485307e-06, "logits/chosen": -0.6719765663146973, "logits/rejected": -0.41625890135765076, "logps/chosen": -759.8556518554688, "logps/rejected": -780.1153564453125, "loss": 0.9513, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -5.481524467468262, "rewards/margins": 0.33042654395103455, "rewards/rejected": -5.811950206756592, "step": 3640 }, { "epoch": 0.24, "grad_norm": 31.125, "learning_rate": 4.712311760502267e-06, "logits/chosen": -0.3793136477470398, "logits/rejected": -0.849342942237854, "logps/chosen": -655.8671875, "logps/rejected": -816.8701782226562, "loss": 0.5335, "rewards/accuracies": 0.75, "rewards/chosen": -4.28458309173584, "rewards/margins": 1.33091139793396, "rewards/rejected": -5.6154937744140625, "step": 3650 }, { "epoch": 0.24, "grad_norm": 47.75, "learning_rate": 4.709646498339181e-06, "logits/chosen": -0.9343104362487793, "logits/rejected": -0.3511790335178375, "logps/chosen": -588.104736328125, "logps/rejected": -652.6898193359375, "loss": 0.673, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.492342472076416, "rewards/margins": 1.0331475734710693, "rewards/rejected": -4.525489807128906, "step": 3660 }, { "epoch": 0.24, "grad_norm": 10.375, "learning_rate": 4.706969707901383e-06, "logits/chosen": -1.353529691696167, "logits/rejected": -1.007610559463501, "logps/chosen": -563.6948852539062, "logps/rejected": -660.8024291992188, "loss": 0.6671, "rewards/accuracies": 0.625, "rewards/chosen": -3.0551259517669678, "rewards/margins": 0.8183481097221375, "rewards/rejected": -3.87347412109375, "step": 3670 }, { "epoch": 0.24, "grad_norm": 13.125, "learning_rate": 4.704281403154355e-06, "logits/chosen": -1.514502763748169, "logits/rejected": -1.004990816116333, "logps/chosen": -623.9945068359375, "logps/rejected": -625.28369140625, "loss": 0.5455, "rewards/accuracies": 0.75, "rewards/chosen": -3.189518451690674, "rewards/margins": 0.6841517686843872, "rewards/rejected": -3.8736701011657715, "step": 3680 }, { "epoch": 0.24, "grad_norm": 16.875, "learning_rate": 4.701581598123649e-06, "logits/chosen": -1.3561251163482666, "logits/rejected": -0.9785367846488953, "logps/chosen": -571.4412841796875, "logps/rejected": -644.2615966796875, "loss": 0.5045, "rewards/accuracies": 0.75, "rewards/chosen": -3.1041922569274902, "rewards/margins": 0.7187532186508179, "rewards/rejected": -3.8229458332061768, "step": 3690 }, { "epoch": 0.24, "grad_norm": 7.46875, "learning_rate": 4.69887030689482e-06, "logits/chosen": -1.0122063159942627, "logits/rejected": -0.6336378455162048, "logps/chosen": -562.7955932617188, "logps/rejected": -593.3282470703125, "loss": 0.526, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.1311886310577393, "rewards/margins": 0.8227275609970093, "rewards/rejected": -3.953916549682617, "step": 3700 }, { "epoch": 0.24, "eval_logits/chosen": -1.1784464120864868, "eval_logits/rejected": -0.7844770550727844, "eval_logps/chosen": -565.1747436523438, "eval_logps/rejected": -638.843505859375, "eval_loss": 0.5307280421257019, "eval_rewards/accuracies": 0.7204999923706055, "eval_rewards/chosen": -3.005551338195801, "eval_rewards/margins": 0.937167227268219, "eval_rewards/rejected": -3.942718267440796, "eval_runtime": 1081.7843, "eval_samples_per_second": 1.849, "eval_steps_per_second": 1.849, "step": 3700 }, { "epoch": 0.24, "grad_norm": 12.375, "learning_rate": 4.696147543613347e-06, "logits/chosen": -1.1483800411224365, "logits/rejected": -0.6617498397827148, "logps/chosen": -523.7181396484375, "logps/rejected": -648.9072265625, "loss": 0.4185, "rewards/accuracies": 0.75, "rewards/chosen": -2.77998685836792, "rewards/margins": 1.386183261871338, "rewards/rejected": -4.166170597076416, "step": 3710 }, { "epoch": 0.24, "grad_norm": 24.625, "learning_rate": 4.693413322484562e-06, "logits/chosen": -0.9697479009628296, "logits/rejected": -0.17288948595523834, "logps/chosen": -580.2626953125, "logps/rejected": -697.6012573242188, "loss": 0.5349, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -3.761812210083008, "rewards/margins": 0.9746885299682617, "rewards/rejected": -4.7365007400512695, "step": 3720 }, { "epoch": 0.24, "grad_norm": 9.75, "learning_rate": 4.690667657773576e-06, "logits/chosen": -0.9481481313705444, "logits/rejected": 0.21047723293304443, "logps/chosen": -665.93017578125, "logps/rejected": -742.4630737304688, "loss": 0.4461, "rewards/accuracies": 0.75, "rewards/chosen": -3.8742809295654297, "rewards/margins": 1.2509515285491943, "rewards/rejected": -5.125232219696045, "step": 3730 }, { "epoch": 0.24, "grad_norm": 16.125, "learning_rate": 4.687910563805206e-06, "logits/chosen": -0.7471721768379211, "logits/rejected": -0.06876786053180695, "logps/chosen": -608.1845092773438, "logps/rejected": -687.7070922851562, "loss": 0.5595, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.8002452850341797, "rewards/margins": 0.842210590839386, "rewards/rejected": -4.642455577850342, "step": 3740 }, { "epoch": 0.25, "grad_norm": 17.375, "learning_rate": 4.685142054963895e-06, "logits/chosen": -0.749923825263977, "logits/rejected": -0.0942760556936264, "logps/chosen": -646.336669921875, "logps/rejected": -798.2391967773438, "loss": 0.4511, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.924957275390625, "rewards/margins": 1.5838274955749512, "rewards/rejected": -5.508784770965576, "step": 3750 }, { "epoch": 0.25, "grad_norm": 11.5, "learning_rate": 4.68236214569364e-06, "logits/chosen": -1.0639054775238037, "logits/rejected": 0.1586601585149765, "logps/chosen": -635.2181396484375, "logps/rejected": -692.99609375, "loss": 0.4083, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.4633872509002686, "rewards/margins": 1.3196017742156982, "rewards/rejected": -4.782989025115967, "step": 3760 }, { "epoch": 0.25, "grad_norm": 19.125, "learning_rate": 4.67957085049792e-06, "logits/chosen": -0.595992922782898, "logits/rejected": -0.5118898153305054, "logps/chosen": -650.3112182617188, "logps/rejected": -853.2965698242188, "loss": 0.6606, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.8930211067199707, "rewards/margins": 1.4812119007110596, "rewards/rejected": -5.374232292175293, "step": 3770 }, { "epoch": 0.25, "grad_norm": 51.5, "learning_rate": 4.676768183939614e-06, "logits/chosen": -1.0965793132781982, "logits/rejected": -1.0568536520004272, "logps/chosen": -578.0953979492188, "logps/rejected": -659.5288696289062, "loss": 0.6467, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -3.147719621658325, "rewards/margins": 0.9447818994522095, "rewards/rejected": -4.092501640319824, "step": 3780 }, { "epoch": 0.25, "grad_norm": 15.125, "learning_rate": 4.673954160640931e-06, "logits/chosen": -1.6548932790756226, "logits/rejected": -1.2120978832244873, "logps/chosen": -520.5531005859375, "logps/rejected": -571.0289306640625, "loss": 0.6295, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.3512320518493652, "rewards/margins": 0.46643513441085815, "rewards/rejected": -2.817667007446289, "step": 3790 }, { "epoch": 0.25, "grad_norm": 8.9375, "learning_rate": 4.671128795283329e-06, "logits/chosen": -1.5643247365951538, "logits/rejected": -1.3110089302062988, "logps/chosen": -473.55194091796875, "logps/rejected": -510.16046142578125, "loss": 0.5895, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.7292392253875732, "rewards/margins": 0.7332664728164673, "rewards/rejected": -2.462505578994751, "step": 3800 }, { "epoch": 0.25, "eval_logits/chosen": -1.6296287775039673, "eval_logits/rejected": -1.4099284410476685, "eval_logps/chosen": -422.7353515625, "eval_logps/rejected": -474.79486083984375, "eval_loss": 0.5400600433349609, "eval_rewards/accuracies": 0.7160000205039978, "eval_rewards/chosen": -1.5811570882797241, "eval_rewards/margins": 0.7210747003555298, "eval_rewards/rejected": -2.302231788635254, "eval_runtime": 1082.8511, "eval_samples_per_second": 1.847, "eval_steps_per_second": 1.847, "step": 3800 }, { "epoch": 0.25, "grad_norm": 23.0, "learning_rate": 4.6682921026074406e-06, "logits/chosen": -1.9362761974334717, "logits/rejected": -1.7173035144805908, "logps/chosen": -469.5625, "logps/rejected": -520.7311401367188, "loss": 0.6036, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.731869101524353, "rewards/margins": 0.65420001745224, "rewards/rejected": -2.386068820953369, "step": 3810 }, { "epoch": 0.25, "grad_norm": 44.5, "learning_rate": 4.665444097412997e-06, "logits/chosen": -1.491999864578247, "logits/rejected": -0.9012607336044312, "logps/chosen": -445.4512634277344, "logps/rejected": -492.11212158203125, "loss": 0.4603, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.8774988651275635, "rewards/margins": 0.9159401655197144, "rewards/rejected": -2.7934391498565674, "step": 3820 }, { "epoch": 0.25, "grad_norm": 27.375, "learning_rate": 4.662584794558748e-06, "logits/chosen": -1.3159914016723633, "logits/rejected": -1.254296898841858, "logps/chosen": -582.7546997070312, "logps/rejected": -645.6920166015625, "loss": 0.6057, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.5350306034088135, "rewards/margins": 0.8374298214912415, "rewards/rejected": -3.3724606037139893, "step": 3830 }, { "epoch": 0.25, "grad_norm": 16.625, "learning_rate": 4.659714208962387e-06, "logits/chosen": -1.2530070543289185, "logits/rejected": 0.10765887796878815, "logps/chosen": -552.3135986328125, "logps/rejected": -636.8727416992188, "loss": 0.4636, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.9280037879943848, "rewards/margins": 1.3956817388534546, "rewards/rejected": -4.3236846923828125, "step": 3840 }, { "epoch": 0.25, "grad_norm": 20.5, "learning_rate": 4.656832355600473e-06, "logits/chosen": -0.2815227210521698, "logits/rejected": -0.1625792682170868, "logps/chosen": -549.5087890625, "logps/rejected": -689.497802734375, "loss": 0.6395, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.7235801219940186, "rewards/margins": 0.9216213226318359, "rewards/rejected": -4.645201683044434, "step": 3850 }, { "epoch": 0.25, "grad_norm": 13.375, "learning_rate": 4.653939249508351e-06, "logits/chosen": -0.656479001045227, "logits/rejected": -0.5062856078147888, "logps/chosen": -521.91357421875, "logps/rejected": -611.4381103515625, "loss": 0.5556, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -3.1964187622070312, "rewards/margins": 0.7598564028739929, "rewards/rejected": -3.956275224685669, "step": 3860 }, { "epoch": 0.25, "grad_norm": 15.125, "learning_rate": 4.651034905780074e-06, "logits/chosen": -0.6921730637550354, "logits/rejected": 0.1719253808259964, "logps/chosen": -569.46337890625, "logps/rejected": -613.8056030273438, "loss": 0.5528, "rewards/accuracies": 0.75, "rewards/chosen": -3.070962429046631, "rewards/margins": 1.176013708114624, "rewards/rejected": -4.246975898742676, "step": 3870 }, { "epoch": 0.25, "grad_norm": 8.5625, "learning_rate": 4.648119339568323e-06, "logits/chosen": -0.8734556436538696, "logits/rejected": -0.39816445112228394, "logps/chosen": -613.6783447265625, "logps/rejected": -683.02978515625, "loss": 0.5368, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.607755661010742, "rewards/margins": 0.6842793822288513, "rewards/rejected": -4.292035102844238, "step": 3880 }, { "epoch": 0.25, "grad_norm": 20.625, "learning_rate": 4.6451925660843335e-06, "logits/chosen": -0.4839141368865967, "logits/rejected": -0.27698439359664917, "logps/chosen": -592.3812255859375, "logps/rejected": -619.481689453125, "loss": 0.5935, "rewards/accuracies": 0.75, "rewards/chosen": -3.4017772674560547, "rewards/margins": 0.7965691089630127, "rewards/rejected": -4.198346138000488, "step": 3890 }, { "epoch": 0.26, "grad_norm": 19.875, "learning_rate": 4.642254600597809e-06, "logits/chosen": -0.45939701795578003, "logits/rejected": -0.8174542188644409, "logps/chosen": -670.4529418945312, "logps/rejected": -755.3880004882812, "loss": 0.7091, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -4.007084369659424, "rewards/margins": 0.4814371168613434, "rewards/rejected": -4.488521099090576, "step": 3900 }, { "epoch": 0.26, "eval_logits/chosen": -0.6684787273406982, "eval_logits/rejected": -0.19569410383701324, "eval_logps/chosen": -642.5602416992188, "eval_logps/rejected": -733.0519409179688, "eval_loss": 0.5537856221199036, "eval_rewards/accuracies": 0.7200000286102295, "eval_rewards/chosen": -3.779406785964966, "eval_rewards/margins": 1.105396032333374, "eval_rewards/rejected": -4.884802341461182, "eval_runtime": 1083.1173, "eval_samples_per_second": 1.847, "eval_steps_per_second": 1.847, "step": 3900 }, { "epoch": 0.26, "grad_norm": 118.5, "learning_rate": 4.639305458436844e-06, "logits/chosen": -0.5646910667419434, "logits/rejected": -0.6691771745681763, "logps/chosen": -657.239501953125, "logps/rejected": -736.2274169921875, "loss": 0.6397, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.880657911300659, "rewards/margins": 0.8775174021720886, "rewards/rejected": -4.758175849914551, "step": 3910 }, { "epoch": 0.26, "grad_norm": 30.875, "learning_rate": 4.636345154987849e-06, "logits/chosen": -0.6892023086547852, "logits/rejected": -0.08253850787878036, "logps/chosen": -690.7490234375, "logps/rejected": -754.5938720703125, "loss": 0.5617, "rewards/accuracies": 0.625, "rewards/chosen": -4.0598464012146, "rewards/margins": 0.9781166315078735, "rewards/rejected": -5.037962913513184, "step": 3920 }, { "epoch": 0.26, "grad_norm": 18.75, "learning_rate": 4.633373705695459e-06, "logits/chosen": -0.5748471021652222, "logits/rejected": -0.11621642112731934, "logps/chosen": -793.6655883789062, "logps/rejected": -875.13818359375, "loss": 0.4634, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.411246299743652, "rewards/margins": 1.4211734533309937, "rewards/rejected": -5.832419395446777, "step": 3930 }, { "epoch": 0.26, "grad_norm": 7.5, "learning_rate": 4.630391126062465e-06, "logits/chosen": -0.52989661693573, "logits/rejected": 0.43578729033470154, "logps/chosen": -720.1056518554688, "logps/rejected": -777.4183349609375, "loss": 0.6325, "rewards/accuracies": 0.625, "rewards/chosen": -4.163119792938232, "rewards/margins": 1.2837560176849365, "rewards/rejected": -5.44687557220459, "step": 3940 }, { "epoch": 0.26, "grad_norm": 40.25, "learning_rate": 4.627397431649726e-06, "logits/chosen": -1.2093026638031006, "logits/rejected": -0.43598490953445435, "logps/chosen": -600.8981323242188, "logps/rejected": -614.5740356445312, "loss": 0.7177, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.1458818912506104, "rewards/margins": 0.8416060209274292, "rewards/rejected": -3.987487316131592, "step": 3950 }, { "epoch": 0.26, "grad_norm": 11.75, "learning_rate": 4.624392638076088e-06, "logits/chosen": -1.203223705291748, "logits/rejected": -0.70448899269104, "logps/chosen": -446.884033203125, "logps/rejected": -515.8419189453125, "loss": 0.5607, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.650068998336792, "rewards/margins": 0.8592624664306641, "rewards/rejected": -3.509331464767456, "step": 3960 }, { "epoch": 0.26, "grad_norm": 25.875, "learning_rate": 4.621376761018308e-06, "logits/chosen": -1.1869691610336304, "logits/rejected": -0.9943572282791138, "logps/chosen": -468.0663146972656, "logps/rejected": -540.9650268554688, "loss": 0.8293, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.4877028465270996, "rewards/margins": 0.7591641545295715, "rewards/rejected": -3.2468669414520264, "step": 3970 }, { "epoch": 0.26, "grad_norm": 24.5, "learning_rate": 4.6183498162109635e-06, "logits/chosen": -1.099056363105774, "logits/rejected": -1.1974191665649414, "logps/chosen": -404.2382507324219, "logps/rejected": -453.99237060546875, "loss": 0.8145, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.174851894378662, "rewards/margins": 0.44647178053855896, "rewards/rejected": -2.621323347091675, "step": 3980 }, { "epoch": 0.26, "grad_norm": 10.75, "learning_rate": 4.615311819446379e-06, "logits/chosen": -1.6770555973052979, "logits/rejected": -1.1778085231781006, "logps/chosen": -489.46875, "logps/rejected": -521.0170288085938, "loss": 0.5504, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5480003356933594, "rewards/margins": 0.9889028668403625, "rewards/rejected": -2.536902904510498, "step": 3990 }, { "epoch": 0.26, "grad_norm": 19.25, "learning_rate": 4.6122627865745376e-06, "logits/chosen": -1.8001086711883545, "logits/rejected": -1.1110185384750366, "logps/chosen": -425.32257080078125, "logps/rejected": -492.922607421875, "loss": 0.504, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.648384690284729, "rewards/margins": 0.8934825658798218, "rewards/rejected": -2.541867256164551, "step": 4000 }, { "epoch": 0.26, "eval_logits/chosen": -1.6914951801300049, "eval_logits/rejected": -1.412641167640686, "eval_logps/chosen": -418.7832946777344, "eval_logps/rejected": -483.5218811035156, "eval_loss": 0.5234020352363586, "eval_rewards/accuracies": 0.7365000247955322, "eval_rewards/chosen": -1.5416367053985596, "eval_rewards/margins": 0.8478653430938721, "eval_rewards/rejected": -2.3895020484924316, "eval_runtime": 1081.8657, "eval_samples_per_second": 1.849, "eval_steps_per_second": 1.849, "step": 4000 }, { "epoch": 0.26, "grad_norm": 12.5, "learning_rate": 4.609202733503001e-06, "logits/chosen": -1.8345247507095337, "logits/rejected": -0.8747278451919556, "logps/chosen": -417.81494140625, "logps/rejected": -465.59613037109375, "loss": 0.4002, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.4220476150512695, "rewards/margins": 1.1609103679656982, "rewards/rejected": -2.582958221435547, "step": 4010 }, { "epoch": 0.26, "grad_norm": 15.1875, "learning_rate": 4.606131676196827e-06, "logits/chosen": -1.6170154809951782, "logits/rejected": -1.663805603981018, "logps/chosen": -442.58282470703125, "logps/rejected": -506.9329528808594, "loss": 0.5167, "rewards/accuracies": 0.75, "rewards/chosen": -1.8274881839752197, "rewards/margins": 0.7952731847763062, "rewards/rejected": -2.6227617263793945, "step": 4020 }, { "epoch": 0.26, "grad_norm": 9.5, "learning_rate": 4.603049630678483e-06, "logits/chosen": -1.6386053562164307, "logits/rejected": -1.1676441431045532, "logps/chosen": -523.4532470703125, "logps/rejected": -641.543701171875, "loss": 0.4736, "rewards/accuracies": 0.75, "rewards/chosen": -2.188727617263794, "rewards/margins": 1.0606520175933838, "rewards/rejected": -3.2493796348571777, "step": 4030 }, { "epoch": 0.26, "grad_norm": 22.625, "learning_rate": 4.599956613027769e-06, "logits/chosen": -1.1470407247543335, "logits/rejected": -1.126011610031128, "logps/chosen": -510.09722900390625, "logps/rejected": -595.1661987304688, "loss": 0.6354, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.7714762687683105, "rewards/margins": 0.9859358668327332, "rewards/rejected": -3.7574124336242676, "step": 4040 }, { "epoch": 0.26, "grad_norm": 28.25, "learning_rate": 4.596852639381724e-06, "logits/chosen": -0.9361361265182495, "logits/rejected": -1.0501594543457031, "logps/chosen": -578.3109741210938, "logps/rejected": -623.2984619140625, "loss": 0.9154, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -3.48213267326355, "rewards/margins": 0.25711530447006226, "rewards/rejected": -3.739248275756836, "step": 4050 }, { "epoch": 0.27, "grad_norm": 20.0, "learning_rate": 4.59373772593455e-06, "logits/chosen": -1.6486784219741821, "logits/rejected": -0.7641524076461792, "logps/chosen": -602.9207153320312, "logps/rejected": -650.1547241210938, "loss": 0.3753, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.450540542602539, "rewards/margins": 1.471935510635376, "rewards/rejected": -3.922475814819336, "step": 4060 }, { "epoch": 0.27, "grad_norm": 19.25, "learning_rate": 4.590611888937525e-06, "logits/chosen": -1.1161381006240845, "logits/rejected": -0.9251816868782043, "logps/chosen": -523.0493774414062, "logps/rejected": -644.383544921875, "loss": 0.4299, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.5072925090789795, "rewards/margins": 1.3106341361999512, "rewards/rejected": -3.8179264068603516, "step": 4070 }, { "epoch": 0.27, "grad_norm": 5.0, "learning_rate": 4.5874751446989175e-06, "logits/chosen": -1.1681926250457764, "logits/rejected": -0.5067660212516785, "logps/chosen": -477.80462646484375, "logps/rejected": -640.333740234375, "loss": 0.453, "rewards/accuracies": 0.75, "rewards/chosen": -2.442675828933716, "rewards/margins": 1.8029371500015259, "rewards/rejected": -4.245613098144531, "step": 4080 }, { "epoch": 0.27, "grad_norm": 7.65625, "learning_rate": 4.5843275095839005e-06, "logits/chosen": -1.4732810258865356, "logits/rejected": -0.6083430051803589, "logps/chosen": -559.0816650390625, "logps/rejected": -641.4822998046875, "loss": 0.5892, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.7278664112091064, "rewards/margins": 1.3080952167510986, "rewards/rejected": -4.035961627960205, "step": 4090 }, { "epoch": 0.27, "grad_norm": 15.1875, "learning_rate": 4.581169000014467e-06, "logits/chosen": -1.0545494556427002, "logits/rejected": -0.8168609738349915, "logps/chosen": -575.4486083984375, "logps/rejected": -708.5343627929688, "loss": 0.571, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.8327815532684326, "rewards/margins": 1.2390844821929932, "rewards/rejected": -4.071866035461426, "step": 4100 }, { "epoch": 0.27, "eval_logits/chosen": -1.0519169569015503, "eval_logits/rejected": -0.6805212497711182, "eval_logps/chosen": -571.6519775390625, "eval_logps/rejected": -674.2472534179688, "eval_loss": 0.5638321042060852, "eval_rewards/accuracies": 0.7254999876022339, "eval_rewards/chosen": -3.0703237056732178, "eval_rewards/margins": 1.2264329195022583, "eval_rewards/rejected": -4.296756267547607, "eval_runtime": 1081.8003, "eval_samples_per_second": 1.849, "eval_steps_per_second": 1.849, "step": 4100 }, { "epoch": 0.27, "grad_norm": 10.8125, "learning_rate": 4.577999632469349e-06, "logits/chosen": -0.7476164102554321, "logits/rejected": -1.0006214380264282, "logps/chosen": -520.135498046875, "logps/rejected": -685.3666381835938, "loss": 0.4509, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.727369785308838, "rewards/margins": 1.612780213356018, "rewards/rejected": -4.340150356292725, "step": 4110 }, { "epoch": 0.27, "grad_norm": 23.0, "learning_rate": 4.574819423483923e-06, "logits/chosen": -0.9771356582641602, "logits/rejected": -0.2744729518890381, "logps/chosen": -508.83795166015625, "logps/rejected": -648.5526123046875, "loss": 0.4298, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.8278720378875732, "rewards/margins": 1.5730832815170288, "rewards/rejected": -4.4009552001953125, "step": 4120 }, { "epoch": 0.27, "grad_norm": 6.78125, "learning_rate": 4.571628389650129e-06, "logits/chosen": -1.3481627702713013, "logits/rejected": -0.6164405345916748, "logps/chosen": -572.0777587890625, "logps/rejected": -595.2544555664062, "loss": 0.5014, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.4666671752929688, "rewards/margins": 1.2540901899337769, "rewards/rejected": -3.720757246017456, "step": 4130 }, { "epoch": 0.27, "grad_norm": 24.0, "learning_rate": 4.568426547616383e-06, "logits/chosen": -0.6733852624893188, "logits/rejected": -1.1666316986083984, "logps/chosen": -494.1783142089844, "logps/rejected": -722.7658081054688, "loss": 0.4672, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.575439929962158, "rewards/margins": 1.6176258325576782, "rewards/rejected": -4.193065643310547, "step": 4140 }, { "epoch": 0.27, "grad_norm": 29.75, "learning_rate": 4.565213914087491e-06, "logits/chosen": -1.145452857017517, "logits/rejected": -0.39761584997177124, "logps/chosen": -544.7350463867188, "logps/rejected": -626.0626220703125, "loss": 0.586, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.786475658416748, "rewards/margins": 0.9719964861869812, "rewards/rejected": -3.758471965789795, "step": 4150 }, { "epoch": 0.27, "grad_norm": 71.5, "learning_rate": 4.561990505824561e-06, "logits/chosen": -0.6237131357192993, "logits/rejected": -0.3988645672798157, "logps/chosen": -554.324951171875, "logps/rejected": -676.8182373046875, "loss": 0.5552, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.6013004779815674, "rewards/margins": 1.313946008682251, "rewards/rejected": -3.9152462482452393, "step": 4160 }, { "epoch": 0.27, "grad_norm": 50.0, "learning_rate": 4.558756339644913e-06, "logits/chosen": -1.2438318729400635, "logits/rejected": -0.693548321723938, "logps/chosen": -479.8941345214844, "logps/rejected": -611.3884887695312, "loss": 0.4949, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.463209390640259, "rewards/margins": 1.350968599319458, "rewards/rejected": -3.8141777515411377, "step": 4170 }, { "epoch": 0.27, "grad_norm": 13.4375, "learning_rate": 4.555511432421996e-06, "logits/chosen": -1.505509853363037, "logits/rejected": -1.1256517171859741, "logps/chosen": -573.950927734375, "logps/rejected": -648.921142578125, "loss": 0.5052, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.4586167335510254, "rewards/margins": 1.0695935487747192, "rewards/rejected": -3.528210163116455, "step": 4180 }, { "epoch": 0.27, "grad_norm": 21.5, "learning_rate": 4.552255801085298e-06, "logits/chosen": -0.7402632236480713, "logits/rejected": -1.029975414276123, "logps/chosen": -564.2478637695312, "logps/rejected": -713.4837036132812, "loss": 0.4851, "rewards/accuracies": 0.75, "rewards/chosen": -2.903712749481201, "rewards/margins": 1.4144295454025269, "rewards/rejected": -4.318142414093018, "step": 4190 }, { "epoch": 0.27, "grad_norm": 11.5, "learning_rate": 4.548989462620254e-06, "logits/chosen": -0.803270161151886, "logits/rejected": -0.8908348083496094, "logps/chosen": -503.0487365722656, "logps/rejected": -665.79248046875, "loss": 0.5907, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.876936435699463, "rewards/margins": 0.909934401512146, "rewards/rejected": -3.7868709564208984, "step": 4200 }, { "epoch": 0.27, "eval_logits/chosen": -0.8569242358207703, "eval_logits/rejected": -0.44857415556907654, "eval_logps/chosen": -545.9052734375, "eval_logps/rejected": -647.9713745117188, "eval_loss": 0.5568973422050476, "eval_rewards/accuracies": 0.7139999866485596, "eval_rewards/chosen": -2.812856912612915, "eval_rewards/margins": 1.2211408615112305, "eval_rewards/rejected": -4.033997535705566, "eval_runtime": 1081.7902, "eval_samples_per_second": 1.849, "eval_steps_per_second": 1.849, "step": 4200 }, { "epoch": 0.28, "grad_norm": 17.25, "learning_rate": 4.545712434068167e-06, "logits/chosen": -0.49404245615005493, "logits/rejected": -0.9404687881469727, "logps/chosen": -506.9319763183594, "logps/rejected": -656.5183715820312, "loss": 0.4279, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.637255907058716, "rewards/margins": 1.4078023433685303, "rewards/rejected": -4.045057773590088, "step": 4210 }, { "epoch": 0.28, "grad_norm": 64.5, "learning_rate": 4.542424732526105e-06, "logits/chosen": -0.4075062870979309, "logits/rejected": -0.11586128175258636, "logps/chosen": -594.1463623046875, "logps/rejected": -943.2869262695312, "loss": 0.2778, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.5571396350860596, "rewards/margins": 3.325035572052002, "rewards/rejected": -6.882175445556641, "step": 4220 }, { "epoch": 0.28, "grad_norm": 173.0, "learning_rate": 4.539126375146827e-06, "logits/chosen": -0.21807250380516052, "logits/rejected": 0.5543156862258911, "logps/chosen": -792.2327880859375, "logps/rejected": -931.7062377929688, "loss": 0.8102, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.748785495758057, "rewards/margins": 2.194566249847412, "rewards/rejected": -6.943351745605469, "step": 4230 }, { "epoch": 0.28, "grad_norm": 11.9375, "learning_rate": 4.535817379138681e-06, "logits/chosen": -0.3225018382072449, "logits/rejected": 0.5527623891830444, "logps/chosen": -655.910888671875, "logps/rejected": -703.3526611328125, "loss": 0.9787, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.0138654708862305, "rewards/margins": 1.03805410861969, "rewards/rejected": -5.051919937133789, "step": 4240 }, { "epoch": 0.28, "grad_norm": 11.3125, "learning_rate": 4.532497761765522e-06, "logits/chosen": -0.7837321758270264, "logits/rejected": -0.8828266263008118, "logps/chosen": -535.7445068359375, "logps/rejected": -638.7557373046875, "loss": 0.5342, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.4627833366394043, "rewards/margins": 1.082082986831665, "rewards/rejected": -3.5448660850524902, "step": 4250 }, { "epoch": 0.28, "grad_norm": 39.75, "learning_rate": 4.529167540346617e-06, "logits/chosen": -1.183983564376831, "logits/rejected": -0.4465523362159729, "logps/chosen": -465.2965393066406, "logps/rejected": -554.4577026367188, "loss": 0.4643, "rewards/accuracies": 0.75, "rewards/chosen": -2.076396942138672, "rewards/margins": 1.2877867221832275, "rewards/rejected": -3.3641839027404785, "step": 4260 }, { "epoch": 0.28, "grad_norm": 12.875, "learning_rate": 4.525826732256561e-06, "logits/chosen": -1.5301613807678223, "logits/rejected": -0.9538251757621765, "logps/chosen": -583.697021484375, "logps/rejected": -672.251220703125, "loss": 0.4599, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.4067509174346924, "rewards/margins": 1.1943697929382324, "rewards/rejected": -3.6011204719543457, "step": 4270 }, { "epoch": 0.28, "grad_norm": 7.78125, "learning_rate": 4.522475354925178e-06, "logits/chosen": -0.8281643986701965, "logits/rejected": -0.6987124681472778, "logps/chosen": -503.9212951660156, "logps/rejected": -613.6614990234375, "loss": 0.5319, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.366013288497925, "rewards/margins": 1.1796926259994507, "rewards/rejected": -3.545706272125244, "step": 4280 }, { "epoch": 0.28, "grad_norm": 10.6875, "learning_rate": 4.519113425837437e-06, "logits/chosen": -0.9132779836654663, "logits/rejected": -0.2604519724845886, "logps/chosen": -584.8671264648438, "logps/rejected": -694.4030151367188, "loss": 0.6601, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.500218629837036, "rewards/margins": 0.9692384004592896, "rewards/rejected": -4.469457149505615, "step": 4290 }, { "epoch": 0.28, "grad_norm": 85.0, "learning_rate": 4.515740962533356e-06, "logits/chosen": -1.3140188455581665, "logits/rejected": -0.2842263877391815, "logps/chosen": -575.7063598632812, "logps/rejected": -712.2975463867188, "loss": 0.4848, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.165140151977539, "rewards/margins": 1.5465600490570068, "rewards/rejected": -4.711699962615967, "step": 4300 }, { "epoch": 0.28, "eval_logits/chosen": -0.7814970016479492, "eval_logits/rejected": -0.3191834092140198, "eval_logps/chosen": -629.6202392578125, "eval_logps/rejected": -754.5432739257812, "eval_loss": 0.5795419216156006, "eval_rewards/accuracies": 0.7279999852180481, "eval_rewards/chosen": -3.65000581741333, "eval_rewards/margins": 1.4497097730636597, "eval_rewards/rejected": -5.099715232849121, "eval_runtime": 1081.8539, "eval_samples_per_second": 1.849, "eval_steps_per_second": 1.849, "step": 4300 }, { "epoch": 0.28, "grad_norm": 33.25, "learning_rate": 4.512357982607914e-06, "logits/chosen": -0.8379079103469849, "logits/rejected": 0.10242664813995361, "logps/chosen": -669.4542236328125, "logps/rejected": -742.5728759765625, "loss": 0.5393, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.792752742767334, "rewards/margins": 1.1992783546447754, "rewards/rejected": -4.992030620574951, "step": 4310 }, { "epoch": 0.28, "grad_norm": 16.375, "learning_rate": 4.50896450371096e-06, "logits/chosen": -0.8042828440666199, "logits/rejected": -0.36978310346603394, "logps/chosen": -649.3548583984375, "logps/rejected": -816.6651000976562, "loss": 0.5086, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.648757219314575, "rewards/margins": 1.6447460651397705, "rewards/rejected": -5.293503761291504, "step": 4320 }, { "epoch": 0.28, "grad_norm": 20.0, "learning_rate": 4.505560543547113e-06, "logits/chosen": -0.4796438217163086, "logits/rejected": -0.1679435819387436, "logps/chosen": -623.3433837890625, "logps/rejected": -734.9503173828125, "loss": 0.6317, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.898946762084961, "rewards/margins": 1.3465676307678223, "rewards/rejected": -5.245513916015625, "step": 4330 }, { "epoch": 0.28, "grad_norm": 17.125, "learning_rate": 4.502146119875681e-06, "logits/chosen": -0.06768389791250229, "logits/rejected": -0.21989555656909943, "logps/chosen": -647.6182861328125, "logps/rejected": -844.0394287109375, "loss": 0.4013, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.9305813312530518, "rewards/margins": 1.9143091440200806, "rewards/rejected": -5.844890594482422, "step": 4340 }, { "epoch": 0.28, "grad_norm": 35.0, "learning_rate": 4.4987212505105606e-06, "logits/chosen": -0.45050114393234253, "logits/rejected": -0.3674498200416565, "logps/chosen": -625.1988525390625, "logps/rejected": -760.817626953125, "loss": 0.6257, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.9897208213806152, "rewards/margins": 1.4841670989990234, "rewards/rejected": -5.473887920379639, "step": 4350 }, { "epoch": 0.29, "grad_norm": 41.5, "learning_rate": 4.495285953320146e-06, "logits/chosen": -0.05475806072354317, "logits/rejected": -0.8858219385147095, "logps/chosen": -779.1076049804688, "logps/rejected": -795.4571533203125, "loss": 0.9393, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -5.019373416900635, "rewards/margins": 0.33503782749176025, "rewards/rejected": -5.3544111251831055, "step": 4360 }, { "epoch": 0.29, "grad_norm": 77.5, "learning_rate": 4.491840246227234e-06, "logits/chosen": -0.9339480400085449, "logits/rejected": -0.12557096779346466, "logps/chosen": -578.0057373046875, "logps/rejected": -664.7117309570312, "loss": 0.5712, "rewards/accuracies": 0.75, "rewards/chosen": -2.970064401626587, "rewards/margins": 1.531270980834961, "rewards/rejected": -4.5013346672058105, "step": 4370 }, { "epoch": 0.29, "grad_norm": 18.75, "learning_rate": 4.488384147208936e-06, "logits/chosen": -1.0364567041397095, "logits/rejected": -1.483686923980713, "logps/chosen": -603.8021240234375, "logps/rejected": -720.2344970703125, "loss": 0.5419, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.3384501934051514, "rewards/margins": 1.1985316276550293, "rewards/rejected": -4.53698205947876, "step": 4380 }, { "epoch": 0.29, "grad_norm": 22.875, "learning_rate": 4.4849176742965805e-06, "logits/chosen": -0.7043582797050476, "logits/rejected": -0.09282146394252777, "logps/chosen": -543.1373901367188, "logps/rejected": -638.6561889648438, "loss": 0.4215, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.787097454071045, "rewards/margins": 1.5514123439788818, "rewards/rejected": -4.338510036468506, "step": 4390 }, { "epoch": 0.29, "grad_norm": 23.0, "learning_rate": 4.481440845575616e-06, "logits/chosen": -0.3808385729789734, "logits/rejected": -0.681591808795929, "logps/chosen": -579.3113403320312, "logps/rejected": -751.4989013671875, "loss": 0.4623, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.23478364944458, "rewards/margins": 1.7022383213043213, "rewards/rejected": -4.937021732330322, "step": 4400 }, { "epoch": 0.29, "eval_logits/chosen": -0.8597971200942993, "eval_logits/rejected": -0.3936343193054199, "eval_logps/chosen": -616.423583984375, "eval_logps/rejected": -746.6427001953125, "eval_loss": 0.5920248031616211, "eval_rewards/accuracies": 0.718999981880188, "eval_rewards/chosen": -3.518040180206299, "eval_rewards/margins": 1.502670168876648, "eval_rewards/rejected": -5.020709991455078, "eval_runtime": 1081.7865, "eval_samples_per_second": 1.849, "eval_steps_per_second": 1.849, "step": 4400 }, { "epoch": 0.29, "grad_norm": 11.9375, "learning_rate": 4.477953679185523e-06, "logits/chosen": -0.6713194847106934, "logits/rejected": -0.3537663221359253, "logps/chosen": -706.6163330078125, "logps/rejected": -770.3363037109375, "loss": 0.7546, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.141589164733887, "rewards/margins": 0.9560039639472961, "rewards/rejected": -5.097593307495117, "step": 4410 }, { "epoch": 0.29, "grad_norm": 40.5, "learning_rate": 4.4744561933197125e-06, "logits/chosen": -0.9083160161972046, "logits/rejected": -0.07630319893360138, "logps/chosen": -616.6812744140625, "logps/rejected": -735.6864013671875, "loss": 0.6813, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -3.6705093383789062, "rewards/margins": 1.2893263101577759, "rewards/rejected": -4.959836006164551, "step": 4420 }, { "epoch": 0.29, "grad_norm": 28.75, "learning_rate": 4.470948406225439e-06, "logits/chosen": -0.8009995222091675, "logits/rejected": -0.8567732572555542, "logps/chosen": -556.5758666992188, "logps/rejected": -637.6351928710938, "loss": 0.6843, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.1794521808624268, "rewards/margins": 0.7183443307876587, "rewards/rejected": -3.897796630859375, "step": 4430 }, { "epoch": 0.29, "grad_norm": 6.5625, "learning_rate": 4.4674303362037e-06, "logits/chosen": -1.120936632156372, "logits/rejected": -0.5086898803710938, "logps/chosen": -571.8004760742188, "logps/rejected": -652.0510864257812, "loss": 0.5568, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.723245143890381, "rewards/margins": 1.411926507949829, "rewards/rejected": -4.135171413421631, "step": 4440 }, { "epoch": 0.29, "grad_norm": 15.4375, "learning_rate": 4.463902001609139e-06, "logits/chosen": -0.9869173169136047, "logits/rejected": -1.069544792175293, "logps/chosen": -524.5968017578125, "logps/rejected": -608.2014770507812, "loss": 0.6635, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.6611359119415283, "rewards/margins": 0.6758316159248352, "rewards/rejected": -3.3369674682617188, "step": 4450 }, { "epoch": 0.29, "grad_norm": 35.75, "learning_rate": 4.460363420849956e-06, "logits/chosen": -1.3604652881622314, "logits/rejected": -1.0074611902236938, "logps/chosen": -505.79248046875, "logps/rejected": -514.6629638671875, "loss": 0.6637, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -2.2508456707000732, "rewards/margins": 0.5542214512825012, "rewards/rejected": -2.805067300796509, "step": 4460 }, { "epoch": 0.29, "grad_norm": 19.125, "learning_rate": 4.456814612387803e-06, "logits/chosen": -1.5499746799468994, "logits/rejected": -1.0545423030853271, "logps/chosen": -491.5321350097656, "logps/rejected": -582.6185302734375, "loss": 0.4174, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.0741610527038574, "rewards/margins": 1.3437223434448242, "rewards/rejected": -3.4178833961486816, "step": 4470 }, { "epoch": 0.29, "grad_norm": 11.0625, "learning_rate": 4.453255594737698e-06, "logits/chosen": -1.1010749340057373, "logits/rejected": -0.3453108072280884, "logps/chosen": -528.232177734375, "logps/rejected": -722.3978881835938, "loss": 0.4009, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.174903392791748, "rewards/margins": 1.7281837463378906, "rewards/rejected": -4.903087615966797, "step": 4480 }, { "epoch": 0.29, "grad_norm": 16.0, "learning_rate": 4.449686386467918e-06, "logits/chosen": -0.6093188524246216, "logits/rejected": -0.5423480272293091, "logps/chosen": -608.8286743164062, "logps/rejected": -707.93505859375, "loss": 0.6309, "rewards/accuracies": 0.625, "rewards/chosen": -3.4494385719299316, "rewards/margins": 1.3332111835479736, "rewards/rejected": -4.782649993896484, "step": 4490 }, { "epoch": 0.29, "grad_norm": 19.75, "learning_rate": 4.4461070061999115e-06, "logits/chosen": -1.0900073051452637, "logits/rejected": -0.7849153876304626, "logps/chosen": -625.6123657226562, "logps/rejected": -803.8609619140625, "loss": 0.4432, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.6813857555389404, "rewards/margins": 1.7808783054351807, "rewards/rejected": -5.462264060974121, "step": 4500 }, { "epoch": 0.29, "eval_logits/chosen": -0.7167415618896484, "eval_logits/rejected": -0.2693716287612915, "eval_logps/chosen": -662.1546630859375, "eval_logps/rejected": -792.8453369140625, "eval_loss": 0.5776159763336182, "eval_rewards/accuracies": 0.734000027179718, "eval_rewards/chosen": -3.975350856781006, "eval_rewards/margins": 1.5073858499526978, "eval_rewards/rejected": -5.4827375411987305, "eval_runtime": 1081.8534, "eval_samples_per_second": 1.849, "eval_steps_per_second": 1.849, "step": 4500 }, { "epoch": 0.3, "grad_norm": 29.125, "learning_rate": 4.4425174726081935e-06, "logits/chosen": -0.5025515556335449, "logits/rejected": -0.3688552975654602, "logps/chosen": -724.7239990234375, "logps/rejected": -872.1954345703125, "loss": 0.8416, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.464038848876953, "rewards/margins": 1.2156445980072021, "rewards/rejected": -5.679683208465576, "step": 4510 }, { "epoch": 0.3, "grad_norm": 86.5, "learning_rate": 4.438917804420252e-06, "logits/chosen": -0.3808960020542145, "logits/rejected": 0.17778636515140533, "logps/chosen": -712.8274536132812, "logps/rejected": -771.2440185546875, "loss": 0.8585, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.684253692626953, "rewards/margins": 1.0335614681243896, "rewards/rejected": -5.717815399169922, "step": 4520 }, { "epoch": 0.3, "grad_norm": 13.875, "learning_rate": 4.435308020416451e-06, "logits/chosen": -0.7072274088859558, "logits/rejected": 0.04207498952746391, "logps/chosen": -702.3142700195312, "logps/rejected": -771.0267333984375, "loss": 0.6103, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.249428749084473, "rewards/margins": 1.4505321979522705, "rewards/rejected": -5.699961185455322, "step": 4530 }, { "epoch": 0.3, "grad_norm": 49.75, "learning_rate": 4.431688139429931e-06, "logits/chosen": -1.144795536994934, "logits/rejected": -0.566633403301239, "logps/chosen": -656.24365234375, "logps/rejected": -693.17431640625, "loss": 0.5379, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.8311080932617188, "rewards/margins": 1.1204969882965088, "rewards/rejected": -4.951605319976807, "step": 4540 }, { "epoch": 0.3, "grad_norm": 28.375, "learning_rate": 4.428058180346508e-06, "logits/chosen": -0.9058189392089844, "logits/rejected": -0.3182491362094879, "logps/chosen": -615.2515869140625, "logps/rejected": -718.3168334960938, "loss": 0.4542, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.3246147632598877, "rewards/margins": 1.4877204895019531, "rewards/rejected": -4.812335014343262, "step": 4550 }, { "epoch": 0.3, "grad_norm": 34.25, "learning_rate": 4.424418162104582e-06, "logits/chosen": -0.5500428080558777, "logits/rejected": -0.17935840785503387, "logps/chosen": -568.5914306640625, "logps/rejected": -647.4351806640625, "loss": 0.4627, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.361078977584839, "rewards/margins": 1.0587937831878662, "rewards/rejected": -4.419872760772705, "step": 4560 }, { "epoch": 0.3, "grad_norm": 27.625, "learning_rate": 4.420768103695033e-06, "logits/chosen": -1.3160474300384521, "logits/rejected": -0.18910571932792664, "logps/chosen": -682.4427490234375, "logps/rejected": -729.5177612304688, "loss": 0.5662, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.5700602531433105, "rewards/margins": 1.2253577709197998, "rewards/rejected": -4.795417785644531, "step": 4570 }, { "epoch": 0.3, "grad_norm": 31.0, "learning_rate": 4.417108024161121e-06, "logits/chosen": -1.2332794666290283, "logits/rejected": -0.33161863684654236, "logps/chosen": -644.4576416015625, "logps/rejected": -761.5418701171875, "loss": 0.4599, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.4773967266082764, "rewards/margins": 1.6219943761825562, "rewards/rejected": -5.099390983581543, "step": 4580 }, { "epoch": 0.3, "grad_norm": 20.25, "learning_rate": 4.413437942598391e-06, "logits/chosen": -1.3692491054534912, "logits/rejected": -0.4652097821235657, "logps/chosen": -704.6348876953125, "logps/rejected": -705.0117797851562, "loss": 0.6848, "rewards/accuracies": 0.625, "rewards/chosen": -4.033935070037842, "rewards/margins": 0.7854772210121155, "rewards/rejected": -4.8194122314453125, "step": 4590 }, { "epoch": 0.3, "grad_norm": 7.21875, "learning_rate": 4.40975787815457e-06, "logits/chosen": -0.858391284942627, "logits/rejected": -0.43409404158592224, "logps/chosen": -619.6019287109375, "logps/rejected": -801.150146484375, "loss": 0.577, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.9776368141174316, "rewards/margins": 1.5999071598052979, "rewards/rejected": -5.577544212341309, "step": 4600 }, { "epoch": 0.3, "eval_logits/chosen": -1.0688438415527344, "eval_logits/rejected": -0.6869731545448303, "eval_logps/chosen": -631.0772705078125, "eval_logps/rejected": -746.00927734375, "eval_loss": 0.553403913974762, "eval_rewards/accuracies": 0.722000002861023, "eval_rewards/chosen": -3.664576768875122, "eval_rewards/margins": 1.3498002290725708, "eval_rewards/rejected": -5.014377117156982, "eval_runtime": 1082.1799, "eval_samples_per_second": 1.848, "eval_steps_per_second": 1.848, "step": 4600 }, { "epoch": 0.3, "grad_norm": 5.96875, "learning_rate": 4.406067850029469e-06, "logits/chosen": -0.8175787925720215, "logits/rejected": -1.0713331699371338, "logps/chosen": -595.6356201171875, "logps/rejected": -722.7973022460938, "loss": 0.6337, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.2231719493865967, "rewards/margins": 1.2218053340911865, "rewards/rejected": -4.444977283477783, "step": 4610 }, { "epoch": 0.3, "grad_norm": 18.0, "learning_rate": 4.402367877474881e-06, "logits/chosen": -1.1315735578536987, "logits/rejected": 0.17930561304092407, "logps/chosen": -653.0616455078125, "logps/rejected": -857.4161987304688, "loss": 0.371, "rewards/accuracies": 0.75, "rewards/chosen": -3.518317461013794, "rewards/margins": 3.087574005126953, "rewards/rejected": -6.605891227722168, "step": 4620 }, { "epoch": 0.3, "grad_norm": 4.375, "learning_rate": 4.398657979794481e-06, "logits/chosen": -0.8940743207931519, "logits/rejected": -0.1416626274585724, "logps/chosen": -653.8370361328125, "logps/rejected": -789.305419921875, "loss": 0.4692, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.9124972820281982, "rewards/margins": 1.9897149801254272, "rewards/rejected": -5.902213096618652, "step": 4630 }, { "epoch": 0.3, "grad_norm": 47.75, "learning_rate": 4.394938176343729e-06, "logits/chosen": -0.8182951211929321, "logits/rejected": -0.7380988001823425, "logps/chosen": -635.1827392578125, "logps/rejected": -743.8736572265625, "loss": 0.556, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.06431245803833, "rewards/margins": 1.4313973188400269, "rewards/rejected": -5.495709419250488, "step": 4640 }, { "epoch": 0.3, "grad_norm": 9.5625, "learning_rate": 4.391208486529762e-06, "logits/chosen": -0.7760597467422485, "logits/rejected": -0.616553783416748, "logps/chosen": -693.7631225585938, "logps/rejected": -865.7893676757812, "loss": 0.4857, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.482192516326904, "rewards/margins": 1.8230196237564087, "rewards/rejected": -6.305211544036865, "step": 4650 }, { "epoch": 0.3, "grad_norm": 9.75, "learning_rate": 4.387468929811299e-06, "logits/chosen": -0.48033612966537476, "logits/rejected": -0.6604604125022888, "logps/chosen": -739.6536254882812, "logps/rejected": -977.2340698242188, "loss": 0.4245, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.7327375411987305, "rewards/margins": 2.143516778945923, "rewards/rejected": -6.876254081726074, "step": 4660 }, { "epoch": 0.31, "grad_norm": 17.625, "learning_rate": 4.383719525698537e-06, "logits/chosen": -0.27727872133255005, "logits/rejected": 0.4713972210884094, "logps/chosen": -833.1067504882812, "logps/rejected": -968.11083984375, "loss": 0.5866, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -6.199848651885986, "rewards/margins": 1.7400710582733154, "rewards/rejected": -7.939919471740723, "step": 4670 }, { "epoch": 0.31, "grad_norm": 26.5, "learning_rate": 4.3799602937530464e-06, "logits/chosen": -0.8035605549812317, "logits/rejected": -0.4954621195793152, "logps/chosen": -882.8875732421875, "logps/rejected": -971.5751953125, "loss": 0.8416, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -6.116240501403809, "rewards/margins": 1.0033018589019775, "rewards/rejected": -7.119542598724365, "step": 4680 }, { "epoch": 0.31, "grad_norm": 12.5, "learning_rate": 4.376191253587676e-06, "logits/chosen": -0.7129504680633545, "logits/rejected": -0.21814504265785217, "logps/chosen": -835.11279296875, "logps/rejected": -896.0750732421875, "loss": 0.6275, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -6.115592002868652, "rewards/margins": 0.9461033940315247, "rewards/rejected": -7.061694145202637, "step": 4690 }, { "epoch": 0.31, "grad_norm": 12.5625, "learning_rate": 4.372412424866444e-06, "logits/chosen": 0.08478070050477982, "logits/rejected": 0.48023366928100586, "logps/chosen": -836.9601440429688, "logps/rejected": -1029.5086669921875, "loss": 0.4871, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -6.202696800231934, "rewards/margins": 2.0506973266601562, "rewards/rejected": -8.25339412689209, "step": 4700 }, { "epoch": 0.31, "eval_logits/chosen": -0.640370786190033, "eval_logits/rejected": -0.2580258250236511, "eval_logps/chosen": -877.8546752929688, "eval_logps/rejected": -984.7041015625, "eval_loss": 0.5627450942993164, "eval_rewards/accuracies": 0.7124999761581421, "eval_rewards/chosen": -6.132349967956543, "eval_rewards/margins": 1.2689738273620605, "eval_rewards/rejected": -7.401324272155762, "eval_runtime": 1081.9583, "eval_samples_per_second": 1.849, "eval_steps_per_second": 1.849, "step": 4700 }, { "epoch": 0.31, "grad_norm": 16.875, "learning_rate": 4.36862382730444e-06, "logits/chosen": -0.5066614747047424, "logits/rejected": -0.3082544207572937, "logps/chosen": -836.1112060546875, "logps/rejected": -907.6394653320312, "loss": 0.7419, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -5.868655204772949, "rewards/margins": 0.7657414078712463, "rewards/rejected": -6.634397029876709, "step": 4710 }, { "epoch": 0.31, "grad_norm": 22.5, "learning_rate": 4.364825480667716e-06, "logits/chosen": -0.9130336046218872, "logits/rejected": -0.3516978919506073, "logps/chosen": -717.4490966796875, "logps/rejected": -783.9661865234375, "loss": 0.5127, "rewards/accuracies": 0.75, "rewards/chosen": -5.155701637268066, "rewards/margins": 0.8118181228637695, "rewards/rejected": -5.967519760131836, "step": 4720 }, { "epoch": 0.31, "grad_norm": 10.125, "learning_rate": 4.361017404773192e-06, "logits/chosen": -0.21449141204357147, "logits/rejected": -0.7119446396827698, "logps/chosen": -690.1495971679688, "logps/rejected": -865.8206176757812, "loss": 0.4481, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.641330242156982, "rewards/margins": 1.5248507261276245, "rewards/rejected": -6.166181564331055, "step": 4730 }, { "epoch": 0.31, "grad_norm": 13.1875, "learning_rate": 4.3571996194885465e-06, "logits/chosen": -1.3118547201156616, "logits/rejected": -0.5208414196968079, "logps/chosen": -818.0625, "logps/rejected": -832.9998168945312, "loss": 0.7387, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -5.4055609703063965, "rewards/margins": 0.7044488787651062, "rewards/rejected": -6.110010147094727, "step": 4740 }, { "epoch": 0.31, "grad_norm": 30.625, "learning_rate": 4.353372144732112e-06, "logits/chosen": -0.9146745800971985, "logits/rejected": -1.0047132968902588, "logps/chosen": -699.4454345703125, "logps/rejected": -823.0584106445312, "loss": 0.4543, "rewards/accuracies": 0.75, "rewards/chosen": -4.41893196105957, "rewards/margins": 1.2925301790237427, "rewards/rejected": -5.71146297454834, "step": 4750 }, { "epoch": 0.31, "grad_norm": 12.9375, "learning_rate": 4.349535000472775e-06, "logits/chosen": -1.190919280052185, "logits/rejected": -0.9429049491882324, "logps/chosen": -702.9523315429688, "logps/rejected": -899.0533447265625, "loss": 0.5672, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.222666263580322, "rewards/margins": 1.6562492847442627, "rewards/rejected": -5.878914833068848, "step": 4760 }, { "epoch": 0.31, "grad_norm": 11.125, "learning_rate": 4.3456882067298726e-06, "logits/chosen": -1.0891876220703125, "logits/rejected": -0.3005872368812561, "logps/chosen": -722.7308349609375, "logps/rejected": -800.7787475585938, "loss": 0.6559, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.501214981079102, "rewards/margins": 0.8556938171386719, "rewards/rejected": -5.356908321380615, "step": 4770 }, { "epoch": 0.31, "grad_norm": 8.9375, "learning_rate": 4.341831783573082e-06, "logits/chosen": -0.4730401933193207, "logits/rejected": -0.21259017288684845, "logps/chosen": -735.0071411132812, "logps/rejected": -867.7970581054688, "loss": 0.5294, "rewards/accuracies": 0.75, "rewards/chosen": -5.000101089477539, "rewards/margins": 1.2712856531143188, "rewards/rejected": -6.271387100219727, "step": 4780 }, { "epoch": 0.31, "grad_norm": 5.9375, "learning_rate": 4.33796575112232e-06, "logits/chosen": -1.3343031406402588, "logits/rejected": -0.9671639204025269, "logps/chosen": -723.875244140625, "logps/rejected": -808.33154296875, "loss": 0.5632, "rewards/accuracies": 0.75, "rewards/chosen": -4.298136234283447, "rewards/margins": 1.0438514947891235, "rewards/rejected": -5.341987609863281, "step": 4790 }, { "epoch": 0.31, "grad_norm": 30.875, "learning_rate": 4.3340901295476405e-06, "logits/chosen": -1.0257949829101562, "logits/rejected": -0.8210827112197876, "logps/chosen": -710.6309814453125, "logps/rejected": -788.1726684570312, "loss": 0.5773, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.3972625732421875, "rewards/margins": 1.0487065315246582, "rewards/rejected": -5.445969104766846, "step": 4800 }, { "epoch": 0.31, "eval_logits/chosen": -1.1398816108703613, "eval_logits/rejected": -0.8069629073143005, "eval_logps/chosen": -673.2337646484375, "eval_logps/rejected": -790.9176025390625, "eval_loss": 0.5535557866096497, "eval_rewards/accuracies": 0.7245000004768372, "eval_rewards/chosen": -4.086142539978027, "eval_rewards/margins": 1.3773170709609985, "eval_rewards/rejected": -5.463459491729736, "eval_runtime": 1082.9829, "eval_samples_per_second": 1.847, "eval_steps_per_second": 1.847, "step": 4800 }, { "epoch": 0.31, "grad_norm": 8.6875, "learning_rate": 4.330204939069121e-06, "logits/chosen": -1.268600583076477, "logits/rejected": -0.4731263220310211, "logps/chosen": -626.3063354492188, "logps/rejected": -808.0425415039062, "loss": 0.4545, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.729457139968872, "rewards/margins": 1.9569820165634155, "rewards/rejected": -5.68643856048584, "step": 4810 }, { "epoch": 0.32, "grad_norm": 30.0, "learning_rate": 4.326310199956768e-06, "logits/chosen": -1.2281825542449951, "logits/rejected": -0.6629482507705688, "logps/chosen": -666.4266357421875, "logps/rejected": -832.2318115234375, "loss": 0.3094, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.8956210613250732, "rewards/margins": 1.8514516353607178, "rewards/rejected": -5.747072696685791, "step": 4820 }, { "epoch": 0.32, "grad_norm": 46.75, "learning_rate": 4.322405932530402e-06, "logits/chosen": -1.138364553451538, "logits/rejected": -0.9049383997917175, "logps/chosen": -707.4952392578125, "logps/rejected": -784.2113037109375, "loss": 0.4496, "rewards/accuracies": 0.75, "rewards/chosen": -3.737819194793701, "rewards/margins": 1.4234949350357056, "rewards/rejected": -5.161314487457275, "step": 4830 }, { "epoch": 0.32, "grad_norm": 6.28125, "learning_rate": 4.318492157159557e-06, "logits/chosen": -0.3926137089729309, "logits/rejected": -0.4637214243412018, "logps/chosen": -775.1493530273438, "logps/rejected": -1034.8909912109375, "loss": 0.3389, "rewards/accuracies": 0.875, "rewards/chosen": -4.9645915031433105, "rewards/margins": 2.2930541038513184, "rewards/rejected": -7.257645606994629, "step": 4840 }, { "epoch": 0.32, "grad_norm": 59.75, "learning_rate": 4.31456889426337e-06, "logits/chosen": -0.6292157769203186, "logits/rejected": -0.15493735671043396, "logps/chosen": -769.8458251953125, "logps/rejected": -1052.250732421875, "loss": 0.3967, "rewards/accuracies": 0.75, "rewards/chosen": -5.2628984451293945, "rewards/margins": 2.7214412689208984, "rewards/rejected": -7.984339237213135, "step": 4850 }, { "epoch": 0.32, "grad_norm": 26.125, "learning_rate": 4.310636164310478e-06, "logits/chosen": -0.25988227128982544, "logits/rejected": 0.5974297523498535, "logps/chosen": -944.4521484375, "logps/rejected": -1129.2073974609375, "loss": 0.6888, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -7.028240203857422, "rewards/margins": 2.1717493534088135, "rewards/rejected": -9.19998836517334, "step": 4860 }, { "epoch": 0.32, "grad_norm": 37.5, "learning_rate": 4.3066939878189115e-06, "logits/chosen": -0.20646479725837708, "logits/rejected": 0.35336896777153015, "logps/chosen": -916.5020751953125, "logps/rejected": -1224.7227783203125, "loss": 0.5414, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -6.523590087890625, "rewards/margins": 3.318316698074341, "rewards/rejected": -9.841906547546387, "step": 4870 }, { "epoch": 0.32, "grad_norm": 21.5, "learning_rate": 4.3027423853559845e-06, "logits/chosen": -0.2247236669063568, "logits/rejected": -0.14924836158752441, "logps/chosen": -795.49658203125, "logps/rejected": -1208.628662109375, "loss": 0.3001, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.756331443786621, "rewards/margins": 3.8679556846618652, "rewards/rejected": -9.624287605285645, "step": 4880 }, { "epoch": 0.32, "grad_norm": 9.5, "learning_rate": 4.298781377538188e-06, "logits/chosen": -0.1809079349040985, "logits/rejected": -0.05475571006536484, "logps/chosen": -850.2587890625, "logps/rejected": -1100.3106689453125, "loss": 0.5806, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -6.022488594055176, "rewards/margins": 2.39762544631958, "rewards/rejected": -8.420113563537598, "step": 4890 }, { "epoch": 0.32, "grad_norm": 26.125, "learning_rate": 4.294810985031084e-06, "logits/chosen": -0.5664982795715332, "logits/rejected": -0.44837671518325806, "logps/chosen": -708.5640869140625, "logps/rejected": -1049.0264892578125, "loss": 0.429, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.846693992614746, "rewards/margins": 2.992805242538452, "rewards/rejected": -7.839498043060303, "step": 4900 }, { "epoch": 0.32, "eval_logits/chosen": -0.8331770896911621, "eval_logits/rejected": -0.452583372592926, "eval_logps/chosen": -734.55908203125, "eval_logps/rejected": -894.9046630859375, "eval_loss": 0.6206492185592651, "eval_rewards/accuracies": 0.7235000133514404, "eval_rewards/chosen": -4.699395656585693, "eval_rewards/margins": 1.8039348125457764, "eval_rewards/rejected": -6.503330230712891, "eval_runtime": 1082.9414, "eval_samples_per_second": 1.847, "eval_steps_per_second": 1.847, "step": 4900 }, { "epoch": 0.32, "grad_norm": 31.0, "learning_rate": 4.290831228549196e-06, "logits/chosen": -0.817001461982727, "logits/rejected": -0.6347015500068665, "logps/chosen": -723.3236083984375, "logps/rejected": -840.0537109375, "loss": 0.6699, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.535086631774902, "rewards/margins": 1.488999605178833, "rewards/rejected": -6.024085998535156, "step": 4910 }, { "epoch": 0.32, "grad_norm": 11.0, "learning_rate": 4.286842128855904e-06, "logits/chosen": -0.8380616307258606, "logits/rejected": -0.6018103957176208, "logps/chosen": -735.3355712890625, "logps/rejected": -836.6847534179688, "loss": 0.6984, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.490719795227051, "rewards/margins": 1.2654566764831543, "rewards/rejected": -5.756176471710205, "step": 4920 }, { "epoch": 0.32, "grad_norm": 54.25, "learning_rate": 4.282843706763329e-06, "logits/chosen": -1.1329466104507446, "logits/rejected": -0.6955538392066956, "logps/chosen": -671.918212890625, "logps/rejected": -773.244873046875, "loss": 0.5761, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.8309829235076904, "rewards/margins": 1.5670149326324463, "rewards/rejected": -5.397997856140137, "step": 4930 }, { "epoch": 0.32, "grad_norm": 10.25, "learning_rate": 4.278835983132236e-06, "logits/chosen": -1.0745939016342163, "logits/rejected": -0.8655071258544922, "logps/chosen": -568.5635986328125, "logps/rejected": -639.2700805664062, "loss": 0.4582, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.2207207679748535, "rewards/margins": 1.3497354984283447, "rewards/rejected": -4.570456027984619, "step": 4940 }, { "epoch": 0.32, "grad_norm": 19.625, "learning_rate": 4.274818978871912e-06, "logits/chosen": -1.128919005393982, "logits/rejected": -1.0048322677612305, "logps/chosen": -708.3638916015625, "logps/rejected": -900.8346557617188, "loss": 0.4363, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.022771835327148, "rewards/margins": 1.5249364376068115, "rewards/rejected": -5.547708034515381, "step": 4950 }, { "epoch": 0.32, "grad_norm": 22.75, "learning_rate": 4.270792714940067e-06, "logits/chosen": -0.8766831159591675, "logits/rejected": -0.7380250692367554, "logps/chosen": -756.6174926757812, "logps/rejected": -1002.4476318359375, "loss": 0.5431, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -5.169392108917236, "rewards/margins": 1.8629051446914673, "rewards/rejected": -7.032296657562256, "step": 4960 }, { "epoch": 0.33, "grad_norm": 26.5, "learning_rate": 4.266757212342721e-06, "logits/chosen": -0.4121031165122986, "logits/rejected": 0.37714892625808716, "logps/chosen": -802.7855834960938, "logps/rejected": -881.6461181640625, "loss": 0.5901, "rewards/accuracies": 0.75, "rewards/chosen": -5.422067165374756, "rewards/margins": 1.4689706563949585, "rewards/rejected": -6.891038417816162, "step": 4970 }, { "epoch": 0.33, "grad_norm": 22.875, "learning_rate": 4.262712492134094e-06, "logits/chosen": -0.9211975336074829, "logits/rejected": -0.5096105933189392, "logps/chosen": -764.8045043945312, "logps/rejected": -898.4817504882812, "loss": 0.7701, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -5.014372825622559, "rewards/margins": 1.6449394226074219, "rewards/rejected": -6.6593122482299805, "step": 4980 }, { "epoch": 0.33, "grad_norm": 3.984375, "learning_rate": 4.2586585754164935e-06, "logits/chosen": -0.8496034741401672, "logits/rejected": -0.09724292904138565, "logps/chosen": -757.1512451171875, "logps/rejected": -910.4989013671875, "loss": 0.4702, "rewards/accuracies": 0.75, "rewards/chosen": -5.067817687988281, "rewards/margins": 1.9143221378326416, "rewards/rejected": -6.982139587402344, "step": 4990 }, { "epoch": 0.33, "grad_norm": 20.5, "learning_rate": 4.254595483340212e-06, "logits/chosen": -0.7568883895874023, "logits/rejected": -0.5595308542251587, "logps/chosen": -732.3966674804688, "logps/rejected": -980.7216796875, "loss": 0.483, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -5.17746114730835, "rewards/margins": 1.7236629724502563, "rewards/rejected": -6.901124477386475, "step": 5000 }, { "epoch": 0.33, "eval_logits/chosen": -0.7938401103019714, "eval_logits/rejected": -0.40959909558296204, "eval_logps/chosen": -795.9951171875, "eval_logps/rejected": -911.3126831054688, "eval_loss": 0.5429574251174927, "eval_rewards/accuracies": 0.7245000004768372, "eval_rewards/chosen": -5.313755512237549, "eval_rewards/margins": 1.3536555767059326, "eval_rewards/rejected": -6.6674113273620605, "eval_runtime": 1082.1833, "eval_samples_per_second": 1.848, "eval_steps_per_second": 1.848, "step": 5000 }, { "epoch": 0.33, "grad_norm": 38.5, "learning_rate": 4.25052323710341e-06, "logits/chosen": -0.6790348887443542, "logits/rejected": -0.18591298162937164, "logps/chosen": -844.7146606445312, "logps/rejected": -930.0905151367188, "loss": 0.4914, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -5.477452278137207, "rewards/margins": 0.9788287281990051, "rewards/rejected": -6.456280708312988, "step": 5010 }, { "epoch": 0.33, "grad_norm": 20.0, "learning_rate": 4.2464418579520085e-06, "logits/chosen": -0.8184741139411926, "logits/rejected": -0.49955207109451294, "logps/chosen": -739.2804565429688, "logps/rejected": -819.4275512695312, "loss": 0.5037, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.855145454406738, "rewards/margins": 1.15547776222229, "rewards/rejected": -6.010622978210449, "step": 5020 }, { "epoch": 0.33, "grad_norm": 27.625, "learning_rate": 4.242351367179575e-06, "logits/chosen": -0.6734864115715027, "logits/rejected": -0.586290717124939, "logps/chosen": -744.8939208984375, "logps/rejected": -899.6710815429688, "loss": 0.4459, "rewards/accuracies": 0.75, "rewards/chosen": -4.767654895782471, "rewards/margins": 1.555180311203003, "rewards/rejected": -6.3228349685668945, "step": 5030 }, { "epoch": 0.33, "grad_norm": 21.875, "learning_rate": 4.238251786127216e-06, "logits/chosen": -0.9706589579582214, "logits/rejected": -0.29847899079322815, "logps/chosen": -790.8389282226562, "logps/rejected": -842.5296630859375, "loss": 0.5895, "rewards/accuracies": 0.75, "rewards/chosen": -4.698607444763184, "rewards/margins": 1.3878940343856812, "rewards/rejected": -6.086501121520996, "step": 5040 }, { "epoch": 0.33, "grad_norm": 29.375, "learning_rate": 4.234143136183465e-06, "logits/chosen": -0.10557955503463745, "logits/rejected": -0.2599624693393707, "logps/chosen": -665.2359619140625, "logps/rejected": -855.2097778320312, "loss": 0.5206, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.663005828857422, "rewards/margins": 1.7516084909439087, "rewards/rejected": -6.414613246917725, "step": 5050 }, { "epoch": 0.33, "grad_norm": 105.0, "learning_rate": 4.230025438784169e-06, "logits/chosen": -1.0614486932754517, "logits/rejected": -0.6720927953720093, "logps/chosen": -735.9578857421875, "logps/rejected": -921.2384033203125, "loss": 0.4275, "rewards/accuracies": 0.75, "rewards/chosen": -4.153075218200684, "rewards/margins": 1.6241191625595093, "rewards/rejected": -5.777194976806641, "step": 5060 }, { "epoch": 0.33, "grad_norm": 35.25, "learning_rate": 4.22589871541238e-06, "logits/chosen": -1.079143762588501, "logits/rejected": -0.573811411857605, "logps/chosen": -713.51904296875, "logps/rejected": -786.7986450195312, "loss": 0.9009, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.534701824188232, "rewards/margins": 0.7654833197593689, "rewards/rejected": -5.300185203552246, "step": 5070 }, { "epoch": 0.33, "grad_norm": 6.4375, "learning_rate": 4.221762987598237e-06, "logits/chosen": -1.105019211769104, "logits/rejected": 0.4622572064399719, "logps/chosen": -771.4699096679688, "logps/rejected": -809.6135864257812, "loss": 0.7207, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.758582592010498, "rewards/margins": 1.2618314027786255, "rewards/rejected": -6.020414352416992, "step": 5080 }, { "epoch": 0.33, "grad_norm": 22.875, "learning_rate": 4.21761827691886e-06, "logits/chosen": -1.195389986038208, "logits/rejected": -0.0938594788312912, "logps/chosen": -622.4869995117188, "logps/rejected": -720.3295288085938, "loss": 0.3799, "rewards/accuracies": 0.75, "rewards/chosen": -3.814146041870117, "rewards/margins": 1.6486718654632568, "rewards/rejected": -5.462817192077637, "step": 5090 }, { "epoch": 0.33, "grad_norm": 13.5625, "learning_rate": 4.213464604998235e-06, "logits/chosen": -1.3423573970794678, "logits/rejected": -0.29991769790649414, "logps/chosen": -646.9683837890625, "logps/rejected": -766.5751342773438, "loss": 0.3309, "rewards/accuracies": 0.875, "rewards/chosen": -3.809772491455078, "rewards/margins": 1.926416039466858, "rewards/rejected": -5.7361884117126465, "step": 5100 }, { "epoch": 0.33, "eval_logits/chosen": -0.9408469200134277, "eval_logits/rejected": -0.504178524017334, "eval_logps/chosen": -711.0602416992188, "eval_logps/rejected": -833.669677734375, "eval_loss": 0.5672723650932312, "eval_rewards/accuracies": 0.7149999737739563, "eval_rewards/chosen": -4.464406967163086, "eval_rewards/margins": 1.426573634147644, "eval_rewards/rejected": -5.890980243682861, "eval_runtime": 1083.0613, "eval_samples_per_second": 1.847, "eval_steps_per_second": 1.847, "step": 5100 }, { "epoch": 0.33, "grad_norm": 61.0, "learning_rate": 4.209301993507099e-06, "logits/chosen": -1.0680879354476929, "logits/rejected": -0.2673115134239197, "logps/chosen": -732.3485717773438, "logps/rejected": -931.9034423828125, "loss": 0.4268, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.552649021148682, "rewards/margins": 2.1921520233154297, "rewards/rejected": -6.7448015213012695, "step": 5110 }, { "epoch": 0.33, "grad_norm": 91.0, "learning_rate": 4.2051304641628295e-06, "logits/chosen": -0.4419058859348297, "logits/rejected": -0.43781739473342896, "logps/chosen": -727.6237182617188, "logps/rejected": -920.49072265625, "loss": 0.6261, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.831849098205566, "rewards/margins": 1.5132211446762085, "rewards/rejected": -6.345069885253906, "step": 5120 }, { "epoch": 0.34, "grad_norm": 34.75, "learning_rate": 4.200950038729335e-06, "logits/chosen": -1.312351107597351, "logits/rejected": -0.9825354814529419, "logps/chosen": -700.2615966796875, "logps/rejected": -751.70849609375, "loss": 0.6362, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.177218914031982, "rewards/margins": 0.9473382830619812, "rewards/rejected": -5.1245574951171875, "step": 5130 }, { "epoch": 0.34, "grad_norm": 128.0, "learning_rate": 4.19676073901693e-06, "logits/chosen": -1.096785306930542, "logits/rejected": -0.45196810364723206, "logps/chosen": -691.6744995117188, "logps/rejected": -853.4054565429688, "loss": 0.6816, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.236144065856934, "rewards/margins": 1.9784491062164307, "rewards/rejected": -6.214592933654785, "step": 5140 }, { "epoch": 0.34, "grad_norm": 12.3125, "learning_rate": 4.192562586882233e-06, "logits/chosen": -1.0074760913848877, "logits/rejected": -0.18417097628116608, "logps/chosen": -699.8735961914062, "logps/rejected": -893.02978515625, "loss": 0.3352, "rewards/accuracies": 0.875, "rewards/chosen": -4.228342533111572, "rewards/margins": 2.711921215057373, "rewards/rejected": -6.940263271331787, "step": 5150 }, { "epoch": 0.34, "grad_norm": 14.0, "learning_rate": 4.188355604228047e-06, "logits/chosen": -1.0675033330917358, "logits/rejected": -0.5531347990036011, "logps/chosen": -829.6004028320312, "logps/rejected": -984.6746826171875, "loss": 0.4828, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -5.197609901428223, "rewards/margins": 1.9620662927627563, "rewards/rejected": -7.159677028656006, "step": 5160 }, { "epoch": 0.34, "grad_norm": 5.625, "learning_rate": 4.184139813003246e-06, "logits/chosen": -0.7708001136779785, "logits/rejected": 0.08678726851940155, "logps/chosen": -903.2506103515625, "logps/rejected": -896.5926513671875, "loss": 1.1491, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -6.287803649902344, "rewards/margins": 0.7200108766555786, "rewards/rejected": -7.007813930511475, "step": 5170 }, { "epoch": 0.34, "grad_norm": 17.625, "learning_rate": 4.179915235202659e-06, "logits/chosen": -1.209047555923462, "logits/rejected": -0.5405303239822388, "logps/chosen": -771.7014770507812, "logps/rejected": -906.0218505859375, "loss": 0.4166, "rewards/accuracies": 0.75, "rewards/chosen": -4.778886318206787, "rewards/margins": 1.7696717977523804, "rewards/rejected": -6.548558235168457, "step": 5180 }, { "epoch": 0.34, "grad_norm": 45.5, "learning_rate": 4.175681892866958e-06, "logits/chosen": -0.8204858899116516, "logits/rejected": -0.40891942381858826, "logps/chosen": -766.3446044921875, "logps/rejected": -872.3656005859375, "loss": 0.6982, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -5.039416313171387, "rewards/margins": 1.4276368618011475, "rewards/rejected": -6.467053413391113, "step": 5190 }, { "epoch": 0.34, "grad_norm": 9.9375, "learning_rate": 4.1714398080825425e-06, "logits/chosen": -1.3470443487167358, "logits/rejected": -0.7673690319061279, "logps/chosen": -759.630859375, "logps/rejected": -861.4703369140625, "loss": 0.5417, "rewards/accuracies": 0.75, "rewards/chosen": -4.593935012817383, "rewards/margins": 1.3017276525497437, "rewards/rejected": -5.895662784576416, "step": 5200 }, { "epoch": 0.34, "eval_logits/chosen": -1.1457747220993042, "eval_logits/rejected": -0.7977614402770996, "eval_logps/chosen": -661.1136474609375, "eval_logps/rejected": -773.7584838867188, "eval_loss": 0.5361440181732178, "eval_rewards/accuracies": 0.7279999852180481, "eval_rewards/chosen": -3.9649405479431152, "eval_rewards/margins": 1.3269275426864624, "eval_rewards/rejected": -5.291868209838867, "eval_runtime": 1082.9731, "eval_samples_per_second": 1.847, "eval_steps_per_second": 1.847, "step": 5200 }, { "epoch": 0.34, "grad_norm": 8.8125, "learning_rate": 4.167189002981421e-06, "logits/chosen": -1.40774405002594, "logits/rejected": -0.8532537221908569, "logps/chosen": -622.4767456054688, "logps/rejected": -752.5357055664062, "loss": 0.3934, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.5531158447265625, "rewards/margins": 1.4387696981430054, "rewards/rejected": -4.991885185241699, "step": 5210 }, { "epoch": 0.34, "grad_norm": 15.375, "learning_rate": 4.162929499741102e-06, "logits/chosen": -1.4106112718582153, "logits/rejected": -0.6664021611213684, "logps/chosen": -584.5672607421875, "logps/rejected": -757.0716552734375, "loss": 0.302, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.4604804515838623, "rewards/margins": 2.0725929737091064, "rewards/rejected": -5.533073425292969, "step": 5220 }, { "epoch": 0.34, "grad_norm": 14.375, "learning_rate": 4.15866132058447e-06, "logits/chosen": -1.044559121131897, "logits/rejected": -0.7239853739738464, "logps/chosen": -730.513916015625, "logps/rejected": -814.0767822265625, "loss": 0.6314, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.710904121398926, "rewards/margins": 1.3517636060714722, "rewards/rejected": -6.062668323516846, "step": 5230 }, { "epoch": 0.34, "grad_norm": 14.875, "learning_rate": 4.1543844877796775e-06, "logits/chosen": -1.0062978267669678, "logits/rejected": -0.4208384156227112, "logps/chosen": -747.843505859375, "logps/rejected": -804.8162231445312, "loss": 0.6168, "rewards/accuracies": 0.625, "rewards/chosen": -4.8410725593566895, "rewards/margins": 1.0686309337615967, "rewards/rejected": -5.909703254699707, "step": 5240 }, { "epoch": 0.34, "grad_norm": 29.25, "learning_rate": 4.150099023640023e-06, "logits/chosen": -0.8938889503479004, "logits/rejected": -0.703365683555603, "logps/chosen": -682.5992431640625, "logps/rejected": -809.3043823242188, "loss": 0.4141, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.370444297790527, "rewards/margins": 1.6058063507080078, "rewards/rejected": -5.976251125335693, "step": 5250 }, { "epoch": 0.34, "grad_norm": 39.5, "learning_rate": 4.145804950523837e-06, "logits/chosen": -1.066474199295044, "logits/rejected": -0.7008960247039795, "logps/chosen": -651.1998291015625, "logps/rejected": -752.4002075195312, "loss": 0.5638, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -4.159370422363281, "rewards/margins": 1.1453665494918823, "rewards/rejected": -5.304737091064453, "step": 5260 }, { "epoch": 0.34, "grad_norm": 10.75, "learning_rate": 4.141502290834367e-06, "logits/chosen": -0.8945801854133606, "logits/rejected": -0.7822094559669495, "logps/chosen": -627.2889404296875, "logps/rejected": -721.8226318359375, "loss": 0.4859, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.6150360107421875, "rewards/margins": 1.2147738933563232, "rewards/rejected": -4.829809665679932, "step": 5270 }, { "epoch": 0.35, "grad_norm": 27.125, "learning_rate": 4.137191067019657e-06, "logits/chosen": -0.9590651392936707, "logits/rejected": -0.6113361120223999, "logps/chosen": -696.3848876953125, "logps/rejected": -759.2896728515625, "loss": 0.6188, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -4.282772064208984, "rewards/margins": 0.8681036233901978, "rewards/rejected": -5.150876045227051, "step": 5280 }, { "epoch": 0.35, "grad_norm": 21.625, "learning_rate": 4.1328713015724315e-06, "logits/chosen": -0.6479889750480652, "logits/rejected": -0.7619959712028503, "logps/chosen": -746.4632568359375, "logps/rejected": -763.807861328125, "loss": 0.6066, "rewards/accuracies": 0.75, "rewards/chosen": -4.304412364959717, "rewards/margins": 1.0026192665100098, "rewards/rejected": -5.307032108306885, "step": 5290 }, { "epoch": 0.35, "grad_norm": 14.0625, "learning_rate": 4.128543017029981e-06, "logits/chosen": -0.9625337719917297, "logits/rejected": 0.6454368829727173, "logps/chosen": -727.755615234375, "logps/rejected": -782.9365234375, "loss": 0.505, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.981069087982178, "rewards/margins": 1.239211082458496, "rewards/rejected": -6.220280647277832, "step": 5300 }, { "epoch": 0.35, "eval_logits/chosen": -0.7747477293014526, "eval_logits/rejected": -0.38483861088752747, "eval_logps/chosen": -780.5414428710938, "eval_logps/rejected": -891.477783203125, "eval_loss": 0.5394361615180969, "eval_rewards/accuracies": 0.734000027179718, "eval_rewards/chosen": -5.159217834472656, "eval_rewards/margins": 1.3098435401916504, "eval_rewards/rejected": -6.469061374664307, "eval_runtime": 1082.9527, "eval_samples_per_second": 1.847, "eval_steps_per_second": 1.847, "step": 5300 }, { "epoch": 0.35, "grad_norm": 30.625, "learning_rate": 4.124206235974042e-06, "logits/chosen": -1.0986745357513428, "logits/rejected": -1.0615088939666748, "logps/chosen": -763.5336303710938, "logps/rejected": -843.90185546875, "loss": 0.8345, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.957098960876465, "rewards/margins": 0.8888531923294067, "rewards/rejected": -5.845952033996582, "step": 5310 }, { "epoch": 0.35, "grad_norm": 40.25, "learning_rate": 4.119860981030677e-06, "logits/chosen": -0.6999555826187134, "logits/rejected": -0.9455103874206543, "logps/chosen": -690.185302734375, "logps/rejected": -821.6904296875, "loss": 0.4529, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.976836204528809, "rewards/margins": 1.3072313070297241, "rewards/rejected": -6.284067630767822, "step": 5320 }, { "epoch": 0.35, "grad_norm": 24.0, "learning_rate": 4.115507274870162e-06, "logits/chosen": -0.8978142738342285, "logits/rejected": -0.5966888666152954, "logps/chosen": -827.7893676757812, "logps/rejected": -856.18017578125, "loss": 0.6403, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -5.382994651794434, "rewards/margins": 0.788791298866272, "rewards/rejected": -6.171786308288574, "step": 5330 }, { "epoch": 0.35, "grad_norm": 33.0, "learning_rate": 4.111145140206862e-06, "logits/chosen": -0.6828926801681519, "logits/rejected": -0.688707172870636, "logps/chosen": -730.4295654296875, "logps/rejected": -763.6844482421875, "loss": 0.66, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -5.014399528503418, "rewards/margins": 0.4639320373535156, "rewards/rejected": -5.478331565856934, "step": 5340 }, { "epoch": 0.35, "grad_norm": 17.25, "learning_rate": 4.106774599799118e-06, "logits/chosen": -1.1645032167434692, "logits/rejected": -0.786527156829834, "logps/chosen": -769.6987915039062, "logps/rejected": -879.6158447265625, "loss": 0.405, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.207489967346191, "rewards/margins": 1.5547535419464111, "rewards/rejected": -5.762242317199707, "step": 5350 }, { "epoch": 0.35, "grad_norm": 19.25, "learning_rate": 4.1023956764491255e-06, "logits/chosen": -0.8519840240478516, "logits/rejected": -0.4784340262413025, "logps/chosen": -673.81591796875, "logps/rejected": -761.4054565429688, "loss": 0.5059, "rewards/accuracies": 0.75, "rewards/chosen": -4.309661865234375, "rewards/margins": 1.076310396194458, "rewards/rejected": -5.385972023010254, "step": 5360 }, { "epoch": 0.35, "grad_norm": 4.65625, "learning_rate": 4.098008393002816e-06, "logits/chosen": -1.1698604822158813, "logits/rejected": -0.5089825391769409, "logps/chosen": -700.197021484375, "logps/rejected": -865.1195068359375, "loss": 0.4047, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.287665367126465, "rewards/margins": 1.768963098526001, "rewards/rejected": -6.056628227233887, "step": 5370 }, { "epoch": 0.35, "grad_norm": 13.5625, "learning_rate": 4.093612772349735e-06, "logits/chosen": -0.9481998682022095, "logits/rejected": -0.3462110459804535, "logps/chosen": -761.7136840820312, "logps/rejected": -845.4225463867188, "loss": 0.6937, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.617761135101318, "rewards/margins": 1.0949015617370605, "rewards/rejected": -5.712663173675537, "step": 5380 }, { "epoch": 0.35, "grad_norm": 14.75, "learning_rate": 4.089208837422929e-06, "logits/chosen": -0.8234946131706238, "logits/rejected": -0.6700694561004639, "logps/chosen": -553.3179931640625, "logps/rejected": -663.1610107421875, "loss": 0.654, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -3.464266300201416, "rewards/margins": 1.0769484043121338, "rewards/rejected": -4.541214942932129, "step": 5390 }, { "epoch": 0.35, "grad_norm": 21.125, "learning_rate": 4.084796611198821e-06, "logits/chosen": -1.4149115085601807, "logits/rejected": -0.5947950482368469, "logps/chosen": -589.1494140625, "logps/rejected": -820.8596801757812, "loss": 0.2418, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.1126530170440674, "rewards/margins": 2.802931547164917, "rewards/rejected": -5.915584564208984, "step": 5400 }, { "epoch": 0.35, "eval_logits/chosen": -1.0946416854858398, "eval_logits/rejected": -0.7070560455322266, "eval_logps/chosen": -637.05322265625, "eval_logps/rejected": -754.3560180664062, "eval_loss": 0.5435988903045654, "eval_rewards/accuracies": 0.7319999933242798, "eval_rewards/chosen": -3.7243361473083496, "eval_rewards/margins": 1.3735060691833496, "eval_rewards/rejected": -5.097842693328857, "eval_runtime": 1082.2669, "eval_samples_per_second": 1.848, "eval_steps_per_second": 1.848, "step": 5400 }, { "epoch": 0.35, "grad_norm": 13.1875, "learning_rate": 4.080376116697089e-06, "logits/chosen": -1.2464878559112549, "logits/rejected": -0.45614171028137207, "logps/chosen": -632.4989013671875, "logps/rejected": -825.7425537109375, "loss": 0.3765, "rewards/accuracies": 0.875, "rewards/chosen": -3.8400719165802, "rewards/margins": 2.326674222946167, "rewards/rejected": -6.166746139526367, "step": 5410 }, { "epoch": 0.35, "grad_norm": 28.0, "learning_rate": 4.075947376980553e-06, "logits/chosen": -1.1727924346923828, "logits/rejected": -0.6635844707489014, "logps/chosen": -605.8697509765625, "logps/rejected": -794.8741455078125, "loss": 0.4659, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.3149075508117676, "rewards/margins": 1.8521528244018555, "rewards/rejected": -5.167060375213623, "step": 5420 }, { "epoch": 0.36, "grad_norm": 31.5, "learning_rate": 4.071510415155048e-06, "logits/chosen": -1.0222463607788086, "logits/rejected": -0.5831054449081421, "logps/chosen": -692.3656616210938, "logps/rejected": -732.0311889648438, "loss": 0.6269, "rewards/accuracies": 0.75, "rewards/chosen": -3.9079864025115967, "rewards/margins": 1.1146795749664307, "rewards/rejected": -5.022665977478027, "step": 5430 }, { "epoch": 0.36, "grad_norm": 13.6875, "learning_rate": 4.0670652543693055e-06, "logits/chosen": -0.9036405682563782, "logits/rejected": -0.9506808519363403, "logps/chosen": -621.09375, "logps/rejected": -683.8199462890625, "loss": 0.5294, "rewards/accuracies": 0.75, "rewards/chosen": -3.540756940841675, "rewards/margins": 1.2119077444076538, "rewards/rejected": -4.752664566040039, "step": 5440 }, { "epoch": 0.36, "grad_norm": 23.0, "learning_rate": 4.062611917814834e-06, "logits/chosen": -0.734868586063385, "logits/rejected": -0.12260384857654572, "logps/chosen": -628.942138671875, "logps/rejected": -716.887939453125, "loss": 0.3957, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.695653200149536, "rewards/margins": 1.4442816972732544, "rewards/rejected": -5.13993501663208, "step": 5450 }, { "epoch": 0.36, "grad_norm": 28.125, "learning_rate": 4.058150428725797e-06, "logits/chosen": -1.1034178733825684, "logits/rejected": -1.174552083015442, "logps/chosen": -635.3003540039062, "logps/rejected": -729.0916748046875, "loss": 0.5408, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.8463757038116455, "rewards/margins": 1.1043140888214111, "rewards/rejected": -4.950689792633057, "step": 5460 }, { "epoch": 0.36, "grad_norm": 7.625, "learning_rate": 4.053680810378892e-06, "logits/chosen": -0.6613587141036987, "logits/rejected": -1.3470520973205566, "logps/chosen": -619.7008666992188, "logps/rejected": -830.4635620117188, "loss": 0.5175, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.956433057785034, "rewards/margins": 1.5075929164886475, "rewards/rejected": -5.464025497436523, "step": 5470 }, { "epoch": 0.36, "grad_norm": 10.125, "learning_rate": 4.049203086093225e-06, "logits/chosen": -1.0525099039077759, "logits/rejected": -0.9128379821777344, "logps/chosen": -670.6365966796875, "logps/rejected": -709.1093139648438, "loss": 0.6514, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.7151882648468018, "rewards/margins": 0.8191520571708679, "rewards/rejected": -4.5343403816223145, "step": 5480 }, { "epoch": 0.36, "grad_norm": 16.5, "learning_rate": 4.0447172792302e-06, "logits/chosen": -1.1726263761520386, "logits/rejected": -0.999742329120636, "logps/chosen": -643.7369384765625, "logps/rejected": -716.4229736328125, "loss": 0.5861, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.6155014038085938, "rewards/margins": 0.8870798349380493, "rewards/rejected": -4.5025811195373535, "step": 5490 }, { "epoch": 0.36, "grad_norm": 19.5, "learning_rate": 4.0402234131933835e-06, "logits/chosen": -1.6205737590789795, "logits/rejected": -0.9880961179733276, "logps/chosen": -723.0994873046875, "logps/rejected": -757.4866943359375, "loss": 0.5596, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.9295127391815186, "rewards/margins": 0.9556741714477539, "rewards/rejected": -4.885186672210693, "step": 5500 }, { "epoch": 0.36, "eval_logits/chosen": -1.1252115964889526, "eval_logits/rejected": -0.8061349987983704, "eval_logps/chosen": -679.8906860351562, "eval_logps/rejected": -785.1954345703125, "eval_loss": 0.5356577038764954, "eval_rewards/accuracies": 0.7354999780654907, "eval_rewards/chosen": -4.152711391448975, "eval_rewards/margins": 1.253526210784912, "eval_rewards/rejected": -5.4062371253967285, "eval_runtime": 1081.9998, "eval_samples_per_second": 1.848, "eval_steps_per_second": 1.848, "step": 5500 }, { "epoch": 0.36, "grad_norm": 13.25, "learning_rate": 4.035721511428391e-06, "logits/chosen": -1.3481359481811523, "logits/rejected": -0.7308357954025269, "logps/chosen": -803.7606201171875, "logps/rejected": -776.6546020507812, "loss": 0.6564, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.331362724304199, "rewards/margins": 0.7839199304580688, "rewards/rejected": -5.115283012390137, "step": 5510 }, { "epoch": 0.36, "grad_norm": 24.625, "learning_rate": 4.0312115974227635e-06, "logits/chosen": -0.9684137105941772, "logits/rejected": -0.9282635450363159, "logps/chosen": -647.1978759765625, "logps/rejected": -850.00146484375, "loss": 0.4134, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.309957981109619, "rewards/margins": 1.7413936853408813, "rewards/rejected": -6.051351547241211, "step": 5520 }, { "epoch": 0.36, "grad_norm": 27.125, "learning_rate": 4.026693694705843e-06, "logits/chosen": -1.1259434223175049, "logits/rejected": -0.28758206963539124, "logps/chosen": -860.9661865234375, "logps/rejected": -890.5445556640625, "loss": 0.7584, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -5.472034454345703, "rewards/margins": 0.9003497362136841, "rewards/rejected": -6.372384071350098, "step": 5530 }, { "epoch": 0.36, "grad_norm": 28.5, "learning_rate": 4.022167826848649e-06, "logits/chosen": -1.1170629262924194, "logits/rejected": -0.8015773892402649, "logps/chosen": -828.4481201171875, "logps/rejected": -929.7276611328125, "loss": 0.6161, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -5.684155464172363, "rewards/margins": 1.134021282196045, "rewards/rejected": -6.81817626953125, "step": 5540 }, { "epoch": 0.36, "grad_norm": 19.0, "learning_rate": 4.0176340174637585e-06, "logits/chosen": -1.111710786819458, "logits/rejected": -0.6490559577941895, "logps/chosen": -796.2223510742188, "logps/rejected": -890.1608276367188, "loss": 0.3638, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -5.205319404602051, "rewards/margins": 1.7864841222763062, "rewards/rejected": -6.9918036460876465, "step": 5550 }, { "epoch": 0.36, "grad_norm": 15.25, "learning_rate": 4.013092290205182e-06, "logits/chosen": -1.2498246431350708, "logits/rejected": -1.2065069675445557, "logps/chosen": -680.7439575195312, "logps/rejected": -786.2127685546875, "loss": 0.6462, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.0445733070373535, "rewards/margins": 1.158097743988037, "rewards/rejected": -5.202670574188232, "step": 5560 }, { "epoch": 0.36, "grad_norm": 56.75, "learning_rate": 4.008542668768239e-06, "logits/chosen": -1.7138067483901978, "logits/rejected": -1.3165632486343384, "logps/chosen": -739.278076171875, "logps/rejected": -764.2333984375, "loss": 0.4548, "rewards/accuracies": 0.75, "rewards/chosen": -3.661093235015869, "rewards/margins": 1.5710136890411377, "rewards/rejected": -5.2321062088012695, "step": 5570 }, { "epoch": 0.37, "grad_norm": 43.5, "learning_rate": 4.003985176889434e-06, "logits/chosen": -1.6422256231307983, "logits/rejected": -1.410667896270752, "logps/chosen": -635.7860107421875, "logps/rejected": -716.4530029296875, "loss": 0.5869, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.5633082389831543, "rewards/margins": 0.9920655488967896, "rewards/rejected": -4.555373668670654, "step": 5580 }, { "epoch": 0.37, "grad_norm": 33.25, "learning_rate": 3.999419838346335e-06, "logits/chosen": -1.3131074905395508, "logits/rejected": -1.17564058303833, "logps/chosen": -638.2360229492188, "logps/rejected": -847.1317138671875, "loss": 0.4779, "rewards/accuracies": 0.75, "rewards/chosen": -3.7024288177490234, "rewards/margins": 1.883131742477417, "rewards/rejected": -5.585560321807861, "step": 5590 }, { "epoch": 0.37, "grad_norm": 12.125, "learning_rate": 3.994846676957448e-06, "logits/chosen": -1.1160207986831665, "logits/rejected": -0.7958636283874512, "logps/chosen": -604.6561889648438, "logps/rejected": -713.4833374023438, "loss": 0.6177, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.630154848098755, "rewards/margins": 1.079613447189331, "rewards/rejected": -4.7097673416137695, "step": 5600 }, { "epoch": 0.37, "eval_logits/chosen": -1.5595428943634033, "eval_logits/rejected": -1.299677848815918, "eval_logps/chosen": -557.4889526367188, "eval_logps/rejected": -660.9725952148438, "eval_loss": 0.5369346141815186, "eval_rewards/accuracies": 0.7315000295639038, "eval_rewards/chosen": -2.9286932945251465, "eval_rewards/margins": 1.2353159189224243, "eval_rewards/rejected": -4.164009094238281, "eval_runtime": 1082.1247, "eval_samples_per_second": 1.848, "eval_steps_per_second": 1.848, "step": 5600 }, { "epoch": 0.37, "grad_norm": 32.75, "learning_rate": 3.99026571658209e-06, "logits/chosen": -1.4458223581314087, "logits/rejected": -1.1404035091400146, "logps/chosen": -567.6046142578125, "logps/rejected": -561.4676513671875, "loss": 0.8373, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -3.0399718284606934, "rewards/margins": 0.4125136733055115, "rewards/rejected": -3.4524855613708496, "step": 5610 }, { "epoch": 0.37, "grad_norm": 5.71875, "learning_rate": 3.985676981120271e-06, "logits/chosen": -1.689714789390564, "logits/rejected": -1.6327787637710571, "logps/chosen": -371.0877685546875, "logps/rejected": -503.96917724609375, "loss": 0.4223, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.8772996664047241, "rewards/margins": 1.2993767261505127, "rewards/rejected": -3.1766762733459473, "step": 5620 }, { "epoch": 0.37, "grad_norm": 9.4375, "learning_rate": 3.981080494512564e-06, "logits/chosen": -1.8023675680160522, "logits/rejected": -1.403769850730896, "logps/chosen": -488.49725341796875, "logps/rejected": -591.573974609375, "loss": 0.5625, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.2437989711761475, "rewards/margins": 1.0076730251312256, "rewards/rejected": -3.251471996307373, "step": 5630 }, { "epoch": 0.37, "grad_norm": 31.5, "learning_rate": 3.97647628073998e-06, "logits/chosen": -1.6415698528289795, "logits/rejected": -1.403395175933838, "logps/chosen": -498.3099060058594, "logps/rejected": -569.7965087890625, "loss": 0.5229, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.2824854850769043, "rewards/margins": 1.0072901248931885, "rewards/rejected": -3.2897751331329346, "step": 5640 }, { "epoch": 0.37, "grad_norm": 8.4375, "learning_rate": 3.971864363823848e-06, "logits/chosen": -1.7067874670028687, "logits/rejected": -1.4452556371688843, "logps/chosen": -434.39617919921875, "logps/rejected": -498.5501403808594, "loss": 0.6427, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.401578903198242, "rewards/margins": 0.7744816541671753, "rewards/rejected": -3.176060438156128, "step": 5650 }, { "epoch": 0.37, "grad_norm": 15.4375, "learning_rate": 3.967244767825681e-06, "logits/chosen": -1.8505491018295288, "logits/rejected": -1.803896188735962, "logps/chosen": -550.6358642578125, "logps/rejected": -618.2778930664062, "loss": 0.5385, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.0667710304260254, "rewards/margins": 0.8015181422233582, "rewards/rejected": -2.8682892322540283, "step": 5660 }, { "epoch": 0.37, "grad_norm": 16.875, "learning_rate": 3.962617516847063e-06, "logits/chosen": -1.8252313137054443, "logits/rejected": -1.7519195079803467, "logps/chosen": -480.7511291503906, "logps/rejected": -586.4462890625, "loss": 0.4783, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.0754568576812744, "rewards/margins": 1.2748162746429443, "rewards/rejected": -3.3502731323242188, "step": 5670 }, { "epoch": 0.37, "grad_norm": 50.75, "learning_rate": 3.957982635029509e-06, "logits/chosen": -1.8326904773712158, "logits/rejected": -1.3705637454986572, "logps/chosen": -481.5323791503906, "logps/rejected": -554.6358642578125, "loss": 0.4297, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.5128815174102783, "rewards/margins": 1.3504384756088257, "rewards/rejected": -3.8633198738098145, "step": 5680 }, { "epoch": 0.37, "grad_norm": 48.5, "learning_rate": 3.9533401465543505e-06, "logits/chosen": -1.489105224609375, "logits/rejected": -1.203481912612915, "logps/chosen": -669.8865356445312, "logps/rejected": -829.7503662109375, "loss": 0.4124, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.795656681060791, "rewards/margins": 1.9159505367279053, "rewards/rejected": -5.711607456207275, "step": 5690 }, { "epoch": 0.37, "grad_norm": 23.875, "learning_rate": 3.948690075642602e-06, "logits/chosen": -1.1138761043548584, "logits/rejected": -1.2881919145584106, "logps/chosen": -699.5226440429688, "logps/rejected": -799.2484741210938, "loss": 0.563, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.647792816162109, "rewards/margins": 1.117394208908081, "rewards/rejected": -5.7651872634887695, "step": 5700 }, { "epoch": 0.37, "eval_logits/chosen": -1.4799950122833252, "eval_logits/rejected": -1.1995702981948853, "eval_logps/chosen": -659.2139892578125, "eval_logps/rejected": -794.9143676757812, "eval_loss": 0.5817158222198486, "eval_rewards/accuracies": 0.7335000038146973, "eval_rewards/chosen": -3.945944309234619, "eval_rewards/margins": 1.5574827194213867, "eval_rewards/rejected": -5.503426551818848, "eval_runtime": 1082.6712, "eval_samples_per_second": 1.847, "eval_steps_per_second": 1.847, "step": 5700 }, { "epoch": 0.37, "grad_norm": 37.75, "learning_rate": 3.944032446554839e-06, "logits/chosen": -1.0514599084854126, "logits/rejected": -1.4115526676177979, "logps/chosen": -641.8993530273438, "logps/rejected": -765.6351318359375, "loss": 0.7676, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.047459602355957, "rewards/margins": 0.9766621589660645, "rewards/rejected": -5.024121284484863, "step": 5710 }, { "epoch": 0.37, "grad_norm": 11.875, "learning_rate": 3.9393672835910705e-06, "logits/chosen": -1.4756271839141846, "logits/rejected": -0.7825470566749573, "logps/chosen": -619.8916015625, "logps/rejected": -690.2598876953125, "loss": 0.6072, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.3649303913116455, "rewards/margins": 1.4027009010314941, "rewards/rejected": -4.767631530761719, "step": 5720 }, { "epoch": 0.37, "grad_norm": 7.40625, "learning_rate": 3.9346946110906095e-06, "logits/chosen": -1.5663444995880127, "logits/rejected": -1.4145170450210571, "logps/chosen": -444.46722412109375, "logps/rejected": -619.8214111328125, "loss": 0.6261, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.471475839614868, "rewards/margins": 1.340376377105713, "rewards/rejected": -3.8118526935577393, "step": 5730 }, { "epoch": 0.38, "grad_norm": 29.5, "learning_rate": 3.93001445343195e-06, "logits/chosen": -1.3641194105148315, "logits/rejected": -1.3650639057159424, "logps/chosen": -412.07000732421875, "logps/rejected": -609.1380004882812, "loss": 0.3181, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.278404712677002, "rewards/margins": 1.7936311960220337, "rewards/rejected": -4.072035789489746, "step": 5740 }, { "epoch": 0.38, "grad_norm": 9.375, "learning_rate": 3.925326835032636e-06, "logits/chosen": -1.5838943719863892, "logits/rejected": -1.5433645248413086, "logps/chosen": -583.2268676757812, "logps/rejected": -722.0751342773438, "loss": 0.4389, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.709766387939453, "rewards/margins": 1.796126365661621, "rewards/rejected": -4.505892753601074, "step": 5750 }, { "epoch": 0.38, "grad_norm": 16.125, "learning_rate": 3.9206317803491375e-06, "logits/chosen": -1.4320387840270996, "logits/rejected": -1.267703652381897, "logps/chosen": -492.402099609375, "logps/rejected": -602.7713623046875, "loss": 0.4845, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.8671345710754395, "rewards/margins": 1.3845067024230957, "rewards/rejected": -4.251641273498535, "step": 5760 }, { "epoch": 0.38, "grad_norm": 33.75, "learning_rate": 3.9159293138767214e-06, "logits/chosen": -1.477116346359253, "logits/rejected": -1.3006573915481567, "logps/chosen": -531.85546875, "logps/rejected": -660.1209106445312, "loss": 0.4674, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.9067342281341553, "rewards/margins": 1.387483835220337, "rewards/rejected": -4.29421854019165, "step": 5770 }, { "epoch": 0.38, "grad_norm": 13.5, "learning_rate": 3.911219460149325e-06, "logits/chosen": -1.4525920152664185, "logits/rejected": -0.95831298828125, "logps/chosen": -618.361083984375, "logps/rejected": -766.6071166992188, "loss": 0.5656, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.4423396587371826, "rewards/margins": 1.4323632717132568, "rewards/rejected": -4.8747029304504395, "step": 5780 }, { "epoch": 0.38, "grad_norm": 9.75, "learning_rate": 3.906502243739424e-06, "logits/chosen": -1.5503230094909668, "logits/rejected": -1.2176319360733032, "logps/chosen": -629.1749877929688, "logps/rejected": -681.2413940429688, "loss": 0.6979, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -3.314401149749756, "rewards/margins": 0.7875905632972717, "rewards/rejected": -4.101991653442383, "step": 5790 }, { "epoch": 0.38, "grad_norm": 25.625, "learning_rate": 3.9017776892579075e-06, "logits/chosen": -1.698019027709961, "logits/rejected": -1.1330796480178833, "logps/chosen": -437.50885009765625, "logps/rejected": -623.7738647460938, "loss": 0.4282, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.5192713737487793, "rewards/margins": 1.771398901939392, "rewards/rejected": -4.290670394897461, "step": 5800 }, { "epoch": 0.38, "eval_logits/chosen": -1.6274447441101074, "eval_logits/rejected": -1.3725231885910034, "eval_logps/chosen": -557.9898681640625, "eval_logps/rejected": -673.3403930664062, "eval_loss": 0.5350064635276794, "eval_rewards/accuracies": 0.7304999828338623, "eval_rewards/chosen": -2.9337027072906494, "eval_rewards/margins": 1.353983998298645, "eval_rewards/rejected": -4.287686347961426, "eval_runtime": 1082.1269, "eval_samples_per_second": 1.848, "eval_steps_per_second": 1.848, "step": 5800 }, { "epoch": 0.38, "grad_norm": 2.609375, "learning_rate": 3.89704582135395e-06, "logits/chosen": -1.740778923034668, "logits/rejected": -1.8198730945587158, "logps/chosen": -612.5232543945312, "logps/rejected": -736.5966796875, "loss": 0.3853, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.0238921642303467, "rewards/margins": 1.615530014038086, "rewards/rejected": -4.639422416687012, "step": 5810 }, { "epoch": 0.38, "grad_norm": 27.0, "learning_rate": 3.8923066647148835e-06, "logits/chosen": -1.657546043395996, "logits/rejected": -1.3283555507659912, "logps/chosen": -570.1175537109375, "logps/rejected": -715.1098022460938, "loss": 0.4112, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.0730254650115967, "rewards/margins": 1.6333930492401123, "rewards/rejected": -4.706418514251709, "step": 5820 }, { "epoch": 0.38, "grad_norm": 11.8125, "learning_rate": 3.8875602440660635e-06, "logits/chosen": -1.5095189809799194, "logits/rejected": -1.4185593128204346, "logps/chosen": -623.71533203125, "logps/rejected": -838.8956298828125, "loss": 0.4626, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.6829471588134766, "rewards/margins": 2.0640056133270264, "rewards/rejected": -5.746953010559082, "step": 5830 }, { "epoch": 0.38, "grad_norm": 6.09375, "learning_rate": 3.882806584170747e-06, "logits/chosen": -1.3547556400299072, "logits/rejected": -1.0621776580810547, "logps/chosen": -532.5726318359375, "logps/rejected": -725.4853515625, "loss": 0.4857, "rewards/accuracies": 0.75, "rewards/chosen": -3.190798282623291, "rewards/margins": 1.9462690353393555, "rewards/rejected": -5.137066841125488, "step": 5840 }, { "epoch": 0.38, "grad_norm": 23.375, "learning_rate": 3.878045709829958e-06, "logits/chosen": -1.7893638610839844, "logits/rejected": -1.2638218402862549, "logps/chosen": -642.9067993164062, "logps/rejected": -726.5803833007812, "loss": 0.5418, "rewards/accuracies": 0.75, "rewards/chosen": -3.6192240715026855, "rewards/margins": 1.4222482442855835, "rewards/rejected": -5.041472434997559, "step": 5850 }, { "epoch": 0.38, "grad_norm": 14.375, "learning_rate": 3.873277645882362e-06, "logits/chosen": -1.3869431018829346, "logits/rejected": -1.3153343200683594, "logps/chosen": -580.6889038085938, "logps/rejected": -761.6790771484375, "loss": 0.4414, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.2940335273742676, "rewards/margins": 2.143383264541626, "rewards/rejected": -5.4374165534973145, "step": 5860 }, { "epoch": 0.38, "grad_norm": 14.9375, "learning_rate": 3.868502417204132e-06, "logits/chosen": -1.777470588684082, "logits/rejected": -1.4611170291900635, "logps/chosen": -642.6820068359375, "logps/rejected": -797.9207153320312, "loss": 0.4358, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.6357054710388184, "rewards/margins": 2.031708240509033, "rewards/rejected": -5.66741418838501, "step": 5870 }, { "epoch": 0.38, "grad_norm": 25.375, "learning_rate": 3.863720048708821e-06, "logits/chosen": -1.4780681133270264, "logits/rejected": -1.129896640777588, "logps/chosen": -700.9755859375, "logps/rejected": -895.6798706054688, "loss": 0.6612, "rewards/accuracies": 0.75, "rewards/chosen": -4.251623153686523, "rewards/margins": 1.962694764137268, "rewards/rejected": -6.21431827545166, "step": 5880 }, { "epoch": 0.39, "grad_norm": 19.375, "learning_rate": 3.8589305653472355e-06, "logits/chosen": -1.6618798971176147, "logits/rejected": -1.192319631576538, "logps/chosen": -709.0529174804688, "logps/rejected": -769.61865234375, "loss": 0.5656, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.7200636863708496, "rewards/margins": 1.78572678565979, "rewards/rejected": -5.505789756774902, "step": 5890 }, { "epoch": 0.39, "grad_norm": 90.0, "learning_rate": 3.854133992107299e-06, "logits/chosen": -1.341066598892212, "logits/rejected": -1.0207972526550293, "logps/chosen": -628.7329711914062, "logps/rejected": -832.7430419921875, "loss": 0.4219, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.00554084777832, "rewards/margins": 1.8663222789764404, "rewards/rejected": -5.87186336517334, "step": 5900 }, { "epoch": 0.39, "eval_logits/chosen": -1.4290486574172974, "eval_logits/rejected": -1.1561980247497559, "eval_logps/chosen": -646.8944091796875, "eval_logps/rejected": -790.7645263671875, "eval_loss": 0.5514551997184753, "eval_rewards/accuracies": 0.7400000095367432, "eval_rewards/chosen": -3.8227481842041016, "eval_rewards/margins": 1.6391799449920654, "eval_rewards/rejected": -5.461928844451904, "eval_runtime": 1082.9238, "eval_samples_per_second": 1.847, "eval_steps_per_second": 1.847, "step": 5900 }, { "epoch": 0.39, "grad_norm": 11.4375, "learning_rate": 3.8493303540139256e-06, "logits/chosen": -1.5411032438278198, "logits/rejected": -1.3670079708099365, "logps/chosen": -648.3821411132812, "logps/rejected": -850.3121337890625, "loss": 0.3531, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.4462647438049316, "rewards/margins": 2.3045997619628906, "rewards/rejected": -5.750864505767822, "step": 5910 }, { "epoch": 0.39, "grad_norm": 16.0, "learning_rate": 3.8445196761288905e-06, "logits/chosen": -1.4124703407287598, "logits/rejected": -1.2245404720306396, "logps/chosen": -652.748291015625, "logps/rejected": -762.3580932617188, "loss": 0.7334, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.239779949188232, "rewards/margins": 1.3376438617706299, "rewards/rejected": -5.577424049377441, "step": 5920 }, { "epoch": 0.39, "grad_norm": 10.5625, "learning_rate": 3.8397019835506925e-06, "logits/chosen": -1.5916811227798462, "logits/rejected": -1.157487154006958, "logps/chosen": -729.3059692382812, "logps/rejected": -887.3469848632812, "loss": 0.3879, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.380138397216797, "rewards/margins": 2.0478060245513916, "rewards/rejected": -6.427944183349609, "step": 5930 }, { "epoch": 0.39, "grad_norm": 20.5, "learning_rate": 3.834877301414432e-06, "logits/chosen": -1.2313817739486694, "logits/rejected": -1.5281293392181396, "logps/chosen": -704.5719604492188, "logps/rejected": -898.78466796875, "loss": 0.5466, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.596787452697754, "rewards/margins": 1.8373740911483765, "rewards/rejected": -6.434161186218262, "step": 5940 }, { "epoch": 0.39, "grad_norm": 25.625, "learning_rate": 3.8300456548916745e-06, "logits/chosen": -1.440124750137329, "logits/rejected": -1.347338080406189, "logps/chosen": -736.0311279296875, "logps/rejected": -864.0257568359375, "loss": 0.7241, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -5.00949764251709, "rewards/margins": 1.23239004611969, "rewards/rejected": -6.24188756942749, "step": 5950 }, { "epoch": 0.39, "grad_norm": 17.75, "learning_rate": 3.82520706919032e-06, "logits/chosen": -1.6062614917755127, "logits/rejected": -0.94788658618927, "logps/chosen": -980.64697265625, "logps/rejected": -950.9678955078125, "loss": 0.9468, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -5.4084086418151855, "rewards/margins": 1.3759183883666992, "rewards/rejected": -6.784327507019043, "step": 5960 }, { "epoch": 0.39, "grad_norm": 13.375, "learning_rate": 3.820361569554472e-06, "logits/chosen": -1.707733154296875, "logits/rejected": -1.702752709388733, "logps/chosen": -732.2637939453125, "logps/rejected": -819.050048828125, "loss": 0.5507, "rewards/accuracies": 0.75, "rewards/chosen": -3.9252769947052, "rewards/margins": 1.2101728916168213, "rewards/rejected": -5.13545036315918, "step": 5970 }, { "epoch": 0.39, "grad_norm": 63.25, "learning_rate": 3.815509181264305e-06, "logits/chosen": -1.2528975009918213, "logits/rejected": -0.9130045175552368, "logps/chosen": -667.488037109375, "logps/rejected": -726.0441284179688, "loss": 0.6026, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.347951412200928, "rewards/margins": 1.0135449171066284, "rewards/rejected": -5.361496925354004, "step": 5980 }, { "epoch": 0.39, "grad_norm": 3.953125, "learning_rate": 3.8106499296359356e-06, "logits/chosen": -0.9657360911369324, "logits/rejected": -1.0461463928222656, "logps/chosen": -581.2642211914062, "logps/rejected": -716.18212890625, "loss": 0.5299, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.8028359413146973, "rewards/margins": 1.3897744417190552, "rewards/rejected": -5.192610263824463, "step": 5990 }, { "epoch": 0.39, "grad_norm": 21.375, "learning_rate": 3.8057838400212867e-06, "logits/chosen": -1.3983871936798096, "logits/rejected": -1.2182483673095703, "logps/chosen": -588.8099975585938, "logps/rejected": -709.674072265625, "loss": 0.6167, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.4612064361572266, "rewards/margins": 1.216709017753601, "rewards/rejected": -4.677915096282959, "step": 6000 }, { "epoch": 0.39, "eval_logits/chosen": -1.5848238468170166, "eval_logits/rejected": -1.3564949035644531, "eval_logps/chosen": -591.4141845703125, "eval_logps/rejected": -704.3193359375, "eval_loss": 0.5245141386985779, "eval_rewards/accuracies": 0.737500011920929, "eval_rewards/chosen": -3.2679457664489746, "eval_rewards/margins": 1.3295302391052246, "eval_rewards/rejected": -4.597475528717041, "eval_runtime": 1083.0057, "eval_samples_per_second": 1.847, "eval_steps_per_second": 1.847, "step": 6000 }, { "epoch": 0.39, "grad_norm": 5.8125, "learning_rate": 3.8009109378079556e-06, "logits/chosen": -1.4836119413375854, "logits/rejected": -0.9885314106941223, "logps/chosen": -576.2704467773438, "logps/rejected": -708.4190673828125, "loss": 0.4585, "rewards/accuracies": 0.75, "rewards/chosen": -3.2755744457244873, "rewards/margins": 1.645939826965332, "rewards/rejected": -4.92151403427124, "step": 6010 }, { "epoch": 0.39, "grad_norm": 13.25, "learning_rate": 3.7960312484190835e-06, "logits/chosen": -1.167911171913147, "logits/rejected": -1.4538495540618896, "logps/chosen": -560.0538330078125, "logps/rejected": -730.6513671875, "loss": 0.6951, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -3.787022113800049, "rewards/margins": 1.178891897201538, "rewards/rejected": -4.965914249420166, "step": 6020 }, { "epoch": 0.39, "grad_norm": 6.9375, "learning_rate": 3.7911447973132237e-06, "logits/chosen": -1.5843948125839233, "logits/rejected": -1.1723289489746094, "logps/chosen": -578.4069213867188, "logps/rejected": -694.5243530273438, "loss": 0.3826, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.9061009883880615, "rewards/margins": 1.6813024282455444, "rewards/rejected": -4.587403774261475, "step": 6030 }, { "epoch": 0.4, "grad_norm": 10.1875, "learning_rate": 3.7862516099842038e-06, "logits/chosen": -1.8603366613388062, "logits/rejected": -1.4774984121322632, "logps/chosen": -607.1301879882812, "logps/rejected": -677.8235473632812, "loss": 0.6075, "rewards/accuracies": 0.75, "rewards/chosen": -2.783193588256836, "rewards/margins": 1.1177361011505127, "rewards/rejected": -3.9009299278259277, "step": 6040 }, { "epoch": 0.4, "grad_norm": 49.25, "learning_rate": 3.7813517119609997e-06, "logits/chosen": -1.5467721223831177, "logits/rejected": -1.9304940700531006, "logps/chosen": -486.9471130371094, "logps/rejected": -573.2645874023438, "loss": 0.5777, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.631441831588745, "rewards/margins": 0.8855684399604797, "rewards/rejected": -3.51701021194458, "step": 6050 }, { "epoch": 0.4, "grad_norm": 8.0, "learning_rate": 3.7764451288075944e-06, "logits/chosen": -1.6885160207748413, "logits/rejected": -1.4086277484893799, "logps/chosen": -459.7559509277344, "logps/rejected": -579.6644287109375, "loss": 0.4131, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.1427462100982666, "rewards/margins": 1.4685786962509155, "rewards/rejected": -3.6113250255584717, "step": 6060 }, { "epoch": 0.4, "grad_norm": 11.0, "learning_rate": 3.7715318861228533e-06, "logits/chosen": -1.5580532550811768, "logits/rejected": -1.191781997680664, "logps/chosen": -442.95977783203125, "logps/rejected": -527.2955932617188, "loss": 0.4695, "rewards/accuracies": 0.75, "rewards/chosen": -2.3964436054229736, "rewards/margins": 1.232130527496338, "rewards/rejected": -3.6285736560821533, "step": 6070 }, { "epoch": 0.4, "grad_norm": 16.75, "learning_rate": 3.7666120095403824e-06, "logits/chosen": -1.5407462120056152, "logits/rejected": -1.0308644771575928, "logps/chosen": -552.0963134765625, "logps/rejected": -679.0263671875, "loss": 0.4407, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.691760301589966, "rewards/margins": 1.562053918838501, "rewards/rejected": -4.253814697265625, "step": 6080 }, { "epoch": 0.4, "grad_norm": 6.34375, "learning_rate": 3.7616855247284e-06, "logits/chosen": -1.5308470726013184, "logits/rejected": -1.286475419998169, "logps/chosen": -535.9384155273438, "logps/rejected": -704.09716796875, "loss": 0.496, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.9896767139434814, "rewards/margins": 1.7069756984710693, "rewards/rejected": -4.696652412414551, "step": 6090 }, { "epoch": 0.4, "grad_norm": 12.375, "learning_rate": 3.756752457389603e-06, "logits/chosen": -1.466505765914917, "logits/rejected": -1.3769114017486572, "logps/chosen": -574.1783447265625, "logps/rejected": -682.5731201171875, "loss": 0.5634, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.346346378326416, "rewards/margins": 1.1510469913482666, "rewards/rejected": -4.4973931312561035, "step": 6100 }, { "epoch": 0.4, "eval_logits/chosen": -1.4960319995880127, "eval_logits/rejected": -1.2393951416015625, "eval_logps/chosen": -604.6244506835938, "eval_logps/rejected": -725.9063110351562, "eval_loss": 0.5365744233131409, "eval_rewards/accuracies": 0.7289999723434448, "eval_rewards/chosen": -3.400047779083252, "eval_rewards/margins": 1.4132988452911377, "eval_rewards/rejected": -4.813346862792969, "eval_runtime": 1081.837, "eval_samples_per_second": 1.849, "eval_steps_per_second": 1.849, "step": 6100 }, { "epoch": 0.4, "grad_norm": 28.375, "learning_rate": 3.7518128332610273e-06, "logits/chosen": -1.572556972503662, "logits/rejected": -1.3453798294067383, "logps/chosen": -504.23046875, "logps/rejected": -694.3775024414062, "loss": 0.4124, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.9383466243743896, "rewards/margins": 1.8021043539047241, "rewards/rejected": -4.740450859069824, "step": 6110 }, { "epoch": 0.4, "grad_norm": 18.0, "learning_rate": 3.74686667811392e-06, "logits/chosen": -1.608520746231079, "logits/rejected": -1.3282487392425537, "logps/chosen": -547.75634765625, "logps/rejected": -664.78662109375, "loss": 0.4339, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.747250556945801, "rewards/margins": 1.506263017654419, "rewards/rejected": -4.253513336181641, "step": 6120 }, { "epoch": 0.4, "grad_norm": 6.34375, "learning_rate": 3.741914017753602e-06, "logits/chosen": -1.6058686971664429, "logits/rejected": -1.4757113456726074, "logps/chosen": -553.7803955078125, "logps/rejected": -768.9378662109375, "loss": 0.4408, "rewards/accuracies": 0.75, "rewards/chosen": -3.0144965648651123, "rewards/margins": 2.0072708129882812, "rewards/rejected": -5.021767616271973, "step": 6130 }, { "epoch": 0.4, "grad_norm": 12.0625, "learning_rate": 3.7369548780193305e-06, "logits/chosen": -1.3685169219970703, "logits/rejected": -1.1551145315170288, "logps/chosen": -538.6937255859375, "logps/rejected": -646.2117309570312, "loss": 0.6227, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.167346477508545, "rewards/margins": 1.1783454418182373, "rewards/rejected": -4.345692157745361, "step": 6140 }, { "epoch": 0.4, "grad_norm": 36.25, "learning_rate": 3.731989284784171e-06, "logits/chosen": -1.3571865558624268, "logits/rejected": -0.886684775352478, "logps/chosen": -565.2444458007812, "logps/rejected": -750.3971557617188, "loss": 0.5326, "rewards/accuracies": 0.75, "rewards/chosen": -3.5890090465545654, "rewards/margins": 1.5495522022247314, "rewards/rejected": -5.138561248779297, "step": 6150 }, { "epoch": 0.4, "grad_norm": 45.0, "learning_rate": 3.7270172639548574e-06, "logits/chosen": -1.5400210618972778, "logits/rejected": -1.2033289670944214, "logps/chosen": -650.5201416015625, "logps/rejected": -846.1435546875, "loss": 0.6559, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.891202449798584, "rewards/margins": 1.8259731531143188, "rewards/rejected": -5.717175483703613, "step": 6160 }, { "epoch": 0.4, "grad_norm": 12.3125, "learning_rate": 3.722038841471656e-06, "logits/chosen": -1.1576217412948608, "logits/rejected": -1.0225852727890015, "logps/chosen": -659.4063720703125, "logps/rejected": -835.1433715820312, "loss": 0.5885, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.057988166809082, "rewards/margins": 1.5210652351379395, "rewards/rejected": -5.579052925109863, "step": 6170 }, { "epoch": 0.4, "grad_norm": 34.5, "learning_rate": 3.717054043308236e-06, "logits/chosen": -1.4750564098358154, "logits/rejected": -1.1705251932144165, "logps/chosen": -708.8960571289062, "logps/rejected": -795.9318237304688, "loss": 0.8075, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.238829612731934, "rewards/margins": 1.227535367012024, "rewards/rejected": -5.466364860534668, "step": 6180 }, { "epoch": 0.41, "grad_norm": 15.8125, "learning_rate": 3.7120628954715256e-06, "logits/chosen": -1.6064462661743164, "logits/rejected": -1.3155604600906372, "logps/chosen": -639.2509765625, "logps/rejected": -729.9886474609375, "loss": 0.4362, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.233790636062622, "rewards/margins": 1.680890679359436, "rewards/rejected": -4.914681434631348, "step": 6190 }, { "epoch": 0.41, "grad_norm": 13.375, "learning_rate": 3.7070654240015835e-06, "logits/chosen": -1.578619122505188, "logits/rejected": -1.4169752597808838, "logps/chosen": -476.73944091796875, "logps/rejected": -664.7938232421875, "loss": 0.4555, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.7442374229431152, "rewards/margins": 1.6113630533218384, "rewards/rejected": -4.355600357055664, "step": 6200 }, { "epoch": 0.41, "eval_logits/chosen": -1.6070528030395508, "eval_logits/rejected": -1.3785420656204224, "eval_logps/chosen": -552.6166381835938, "eval_logps/rejected": -677.3169555664062, "eval_loss": 0.5346325039863586, "eval_rewards/accuracies": 0.7325000166893005, "eval_rewards/chosen": -2.8799703121185303, "eval_rewards/margins": 1.4474828243255615, "eval_rewards/rejected": -4.327452659606934, "eval_runtime": 1081.9528, "eval_samples_per_second": 1.849, "eval_steps_per_second": 1.849, "step": 6200 }, { "epoch": 0.41, "grad_norm": 26.5, "learning_rate": 3.7020616549714605e-06, "logits/chosen": -1.5820451974868774, "logits/rejected": -1.4527888298034668, "logps/chosen": -556.2267456054688, "logps/rejected": -595.763427734375, "loss": 0.5252, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.731945276260376, "rewards/margins": 0.9635213613510132, "rewards/rejected": -3.6954665184020996, "step": 6210 }, { "epoch": 0.41, "grad_norm": 40.75, "learning_rate": 3.6970516144870623e-06, "logits/chosen": -1.4573991298675537, "logits/rejected": -1.5476467609405518, "logps/chosen": -636.523193359375, "logps/rejected": -731.8717651367188, "loss": 0.6086, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.6467158794403076, "rewards/margins": 1.0786383152008057, "rewards/rejected": -4.725354194641113, "step": 6220 }, { "epoch": 0.41, "grad_norm": 12.6875, "learning_rate": 3.6920353286870142e-06, "logits/chosen": -1.6345205307006836, "logits/rejected": -1.3034451007843018, "logps/chosen": -642.0438232421875, "logps/rejected": -865.1248168945312, "loss": 0.5246, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.656198024749756, "rewards/margins": 2.057004928588867, "rewards/rejected": -5.713203430175781, "step": 6230 }, { "epoch": 0.41, "grad_norm": 20.0, "learning_rate": 3.687012823742526e-06, "logits/chosen": -1.5325613021850586, "logits/rejected": -1.4991018772125244, "logps/chosen": -514.2377319335938, "logps/rejected": -645.3177490234375, "loss": 0.6701, "rewards/accuracies": 0.625, "rewards/chosen": -2.8916468620300293, "rewards/margins": 0.9674122929573059, "rewards/rejected": -3.8590590953826904, "step": 6240 }, { "epoch": 0.41, "grad_norm": 24.875, "learning_rate": 3.681984125857254e-06, "logits/chosen": -1.4720264673233032, "logits/rejected": -1.5648610591888428, "logps/chosen": -534.4149169921875, "logps/rejected": -682.79736328125, "loss": 0.5464, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.929445505142212, "rewards/margins": 1.2244226932525635, "rewards/rejected": -4.153868198394775, "step": 6250 }, { "epoch": 0.41, "grad_norm": 67.0, "learning_rate": 3.6769492612671637e-06, "logits/chosen": -1.4158203601837158, "logits/rejected": -1.0097553730010986, "logps/chosen": -567.1195068359375, "logps/rejected": -717.0623168945312, "loss": 0.7972, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -3.345048427581787, "rewards/margins": 1.7322750091552734, "rewards/rejected": -5.0773234367370605, "step": 6260 }, { "epoch": 0.41, "grad_norm": 12.0, "learning_rate": 3.671908256240395e-06, "logits/chosen": -1.6651582717895508, "logits/rejected": -1.4031132459640503, "logps/chosen": -535.5482177734375, "logps/rejected": -580.1553344726562, "loss": 0.5755, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.532740831375122, "rewards/margins": 1.1729122400283813, "rewards/rejected": -3.705652952194214, "step": 6270 }, { "epoch": 0.41, "grad_norm": 6.21875, "learning_rate": 3.6668611370771223e-06, "logits/chosen": -1.782926321029663, "logits/rejected": -1.753544807434082, "logps/chosen": -466.7909240722656, "logps/rejected": -560.46533203125, "loss": 0.505, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.3398568630218506, "rewards/margins": 1.0746307373046875, "rewards/rejected": -3.414487838745117, "step": 6280 }, { "epoch": 0.41, "grad_norm": 22.375, "learning_rate": 3.661807930109422e-06, "logits/chosen": -1.7409778833389282, "logits/rejected": -1.3587533235549927, "logps/chosen": -532.5238037109375, "logps/rejected": -669.7543334960938, "loss": 0.3975, "rewards/accuracies": 0.75, "rewards/chosen": -2.3641557693481445, "rewards/margins": 1.5506665706634521, "rewards/rejected": -3.9148223400115967, "step": 6290 }, { "epoch": 0.41, "grad_norm": 6.28125, "learning_rate": 3.6567486617011284e-06, "logits/chosen": -1.7018816471099854, "logits/rejected": -1.5539731979370117, "logps/chosen": -470.53546142578125, "logps/rejected": -603.2235717773438, "loss": 0.328, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.051143169403076, "rewards/margins": 1.8699243068695068, "rewards/rejected": -3.921067476272583, "step": 6300 }, { "epoch": 0.41, "eval_logits/chosen": -1.6985998153686523, "eval_logits/rejected": -1.4531536102294922, "eval_logps/chosen": -517.8211669921875, "eval_logps/rejected": -646.3101196289062, "eval_loss": 0.5238416790962219, "eval_rewards/accuracies": 0.7300000190734863, "eval_rewards/chosen": -2.532015562057495, "eval_rewards/margins": 1.4853687286376953, "eval_rewards/rejected": -4.0173845291137695, "eval_runtime": 1082.1756, "eval_samples_per_second": 1.848, "eval_steps_per_second": 1.848, "step": 6300 }, { "epoch": 0.41, "grad_norm": 21.75, "learning_rate": 3.6516833582477018e-06, "logits/chosen": -1.4238166809082031, "logits/rejected": -1.4345602989196777, "logps/chosen": -565.3036499023438, "logps/rejected": -767.2711791992188, "loss": 0.3338, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.494722843170166, "rewards/margins": 2.421818256378174, "rewards/rejected": -4.91654109954834, "step": 6310 }, { "epoch": 0.41, "grad_norm": 26.75, "learning_rate": 3.64661204617609e-06, "logits/chosen": -1.4988741874694824, "logits/rejected": -1.5717788934707642, "logps/chosen": -578.56787109375, "logps/rejected": -748.5559692382812, "loss": 0.6146, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.0843212604522705, "rewards/margins": 1.6065104007720947, "rewards/rejected": -4.690831184387207, "step": 6320 }, { "epoch": 0.41, "grad_norm": 8.875, "learning_rate": 3.641534751944587e-06, "logits/chosen": -1.1790145635604858, "logits/rejected": -0.9592960476875305, "logps/chosen": -694.0, "logps/rejected": -936.046875, "loss": 0.3093, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.7745890617370605, "rewards/margins": 2.649610996246338, "rewards/rejected": -6.424200534820557, "step": 6330 }, { "epoch": 0.41, "grad_norm": 19.25, "learning_rate": 3.6364515020426984e-06, "logits/chosen": -1.2898048162460327, "logits/rejected": -0.792587161064148, "logps/chosen": -702.462890625, "logps/rejected": -855.5587158203125, "loss": 0.6209, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.526650905609131, "rewards/margins": 1.7273727655410767, "rewards/rejected": -6.254024028778076, "step": 6340 }, { "epoch": 0.42, "grad_norm": 41.25, "learning_rate": 3.631362322991001e-06, "logits/chosen": -1.1884403228759766, "logits/rejected": -0.4388814866542816, "logps/chosen": -766.160888671875, "logps/rejected": -840.6085815429688, "loss": 1.0845, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.739851474761963, "rewards/margins": 1.5970858335494995, "rewards/rejected": -6.336936950683594, "step": 6350 }, { "epoch": 0.42, "grad_norm": 25.75, "learning_rate": 3.626267241341007e-06, "logits/chosen": -1.202844262123108, "logits/rejected": -0.4009116291999817, "logps/chosen": -644.9981079101562, "logps/rejected": -789.4498901367188, "loss": 0.4787, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.031848907470703, "rewards/margins": 2.0493080615997314, "rewards/rejected": -6.081157207489014, "step": 6360 }, { "epoch": 0.42, "grad_norm": 19.875, "learning_rate": 3.621166283675023e-06, "logits/chosen": -1.449704647064209, "logits/rejected": -1.191806435585022, "logps/chosen": -629.6187744140625, "logps/rejected": -842.9913940429688, "loss": 0.4715, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.949559450149536, "rewards/margins": 2.143256425857544, "rewards/rejected": -6.092815399169922, "step": 6370 }, { "epoch": 0.42, "grad_norm": 24.125, "learning_rate": 3.6160594766060118e-06, "logits/chosen": -1.467877745628357, "logits/rejected": -1.1554393768310547, "logps/chosen": -628.1842041015625, "logps/rejected": -755.1007690429688, "loss": 0.5117, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.5091500282287598, "rewards/margins": 1.7960045337677002, "rewards/rejected": -5.305155277252197, "step": 6380 }, { "epoch": 0.42, "grad_norm": 30.25, "learning_rate": 3.610946846777455e-06, "logits/chosen": -1.4568917751312256, "logits/rejected": -1.4761440753936768, "logps/chosen": -692.5374145507812, "logps/rejected": -762.5696411132812, "loss": 0.7583, "rewards/accuracies": 0.625, "rewards/chosen": -3.34906005859375, "rewards/margins": 1.2803717851638794, "rewards/rejected": -4.62943172454834, "step": 6390 }, { "epoch": 0.42, "grad_norm": 10.0625, "learning_rate": 3.6058284208632134e-06, "logits/chosen": -1.6184349060058594, "logits/rejected": -0.9301021695137024, "logps/chosen": -567.7501220703125, "logps/rejected": -619.2005615234375, "loss": 0.6362, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.800585985183716, "rewards/margins": 1.2465732097625732, "rewards/rejected": -4.047158241271973, "step": 6400 }, { "epoch": 0.42, "eval_logits/chosen": -1.475805640220642, "eval_logits/rejected": -1.1700021028518677, "eval_logps/chosen": -567.556884765625, "eval_logps/rejected": -702.3619995117188, "eval_loss": 0.5240737199783325, "eval_rewards/accuracies": 0.7350000143051147, "eval_rewards/chosen": -3.029372453689575, "eval_rewards/margins": 1.5485310554504395, "eval_rewards/rejected": -4.5779032707214355, "eval_runtime": 1082.9747, "eval_samples_per_second": 1.847, "eval_steps_per_second": 1.847, "step": 6400 }, { "epoch": 0.42, "grad_norm": 4.03125, "learning_rate": 3.6007042255673847e-06, "logits/chosen": -1.3820555210113525, "logits/rejected": -1.0473482608795166, "logps/chosen": -544.0860595703125, "logps/rejected": -649.0595703125, "loss": 0.5215, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.9713704586029053, "rewards/margins": 1.450679898262024, "rewards/rejected": -4.422050952911377, "step": 6410 }, { "epoch": 0.42, "grad_norm": 8.375, "learning_rate": 3.5955742876241696e-06, "logits/chosen": -1.3970824480056763, "logits/rejected": -1.0970070362091064, "logps/chosen": -537.3336791992188, "logps/rejected": -655.501953125, "loss": 0.4623, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.101969003677368, "rewards/margins": 1.3595247268676758, "rewards/rejected": -4.461493492126465, "step": 6420 }, { "epoch": 0.42, "grad_norm": 16.25, "learning_rate": 3.5904386337977287e-06, "logits/chosen": -1.207366704940796, "logits/rejected": -1.1440770626068115, "logps/chosen": -589.4725341796875, "logps/rejected": -772.9846801757812, "loss": 0.3992, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.3163509368896484, "rewards/margins": 1.8100507259368896, "rewards/rejected": -5.126400947570801, "step": 6430 }, { "epoch": 0.42, "grad_norm": 20.75, "learning_rate": 3.5852972908820445e-06, "logits/chosen": -1.5347782373428345, "logits/rejected": -0.8615711331367493, "logps/chosen": -724.7694091796875, "logps/rejected": -743.4436645507812, "loss": 0.7791, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -3.9657223224639893, "rewards/margins": 1.147249460220337, "rewards/rejected": -5.112971305847168, "step": 6440 }, { "epoch": 0.42, "grad_norm": 13.875, "learning_rate": 3.5801502857007787e-06, "logits/chosen": -1.7873079776763916, "logits/rejected": -1.0365417003631592, "logps/chosen": -558.655029296875, "logps/rejected": -759.9094848632812, "loss": 0.3587, "rewards/accuracies": 0.875, "rewards/chosen": -2.922118663787842, "rewards/margins": 2.2328388690948486, "rewards/rejected": -5.1549577713012695, "step": 6450 }, { "epoch": 0.42, "grad_norm": 55.5, "learning_rate": 3.5749976451071377e-06, "logits/chosen": -1.1307766437530518, "logits/rejected": -0.8925553560256958, "logps/chosen": -540.0488891601562, "logps/rejected": -649.8804931640625, "loss": 0.668, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.295483350753784, "rewards/margins": 0.9265762567520142, "rewards/rejected": -4.222059726715088, "step": 6460 }, { "epoch": 0.42, "grad_norm": 10.9375, "learning_rate": 3.5698393959837267e-06, "logits/chosen": -1.5463672876358032, "logits/rejected": -0.4483901858329773, "logps/chosen": -580.3298950195312, "logps/rejected": -738.4647216796875, "loss": 0.4245, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.016369342803955, "rewards/margins": 2.370147228240967, "rewards/rejected": -5.386516094207764, "step": 6470 }, { "epoch": 0.42, "grad_norm": 16.125, "learning_rate": 3.5646755652424125e-06, "logits/chosen": -1.4042866230010986, "logits/rejected": -1.0103751420974731, "logps/chosen": -648.745849609375, "logps/rejected": -681.9848022460938, "loss": 0.6245, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.4761767387390137, "rewards/margins": 0.9593431353569031, "rewards/rejected": -4.435520172119141, "step": 6480 }, { "epoch": 0.42, "grad_norm": 7.46875, "learning_rate": 3.559506179824184e-06, "logits/chosen": -1.4322901964187622, "logits/rejected": -1.2974274158477783, "logps/chosen": -506.22991943359375, "logps/rejected": -648.1929931640625, "loss": 0.4794, "rewards/accuracies": 0.75, "rewards/chosen": -2.8861899375915527, "rewards/margins": 1.4032824039459229, "rewards/rejected": -4.289472579956055, "step": 6490 }, { "epoch": 0.43, "grad_norm": 24.5, "learning_rate": 3.554331266699007e-06, "logits/chosen": -1.5026600360870361, "logits/rejected": -0.834079384803772, "logps/chosen": -739.8030395507812, "logps/rejected": -840.1594848632812, "loss": 0.3597, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.8772406578063965, "rewards/margins": 1.7907218933105469, "rewards/rejected": -5.667962551116943, "step": 6500 }, { "epoch": 0.43, "eval_logits/chosen": -1.2830278873443604, "eval_logits/rejected": -0.9547072649002075, "eval_logps/chosen": -627.9058837890625, "eval_logps/rejected": -779.1708374023438, "eval_loss": 0.5416303277015686, "eval_rewards/accuracies": 0.7354999780654907, "eval_rewards/chosen": -3.6328623294830322, "eval_rewards/margins": 1.713128924369812, "eval_rewards/rejected": -5.345991134643555, "eval_runtime": 1085.8109, "eval_samples_per_second": 1.842, "eval_steps_per_second": 1.842, "step": 6500 }, { "epoch": 0.43, "grad_norm": 14.5625, "learning_rate": 3.549150852865691e-06, "logits/chosen": -1.4174201488494873, "logits/rejected": -0.9821884036064148, "logps/chosen": -596.0966796875, "logps/rejected": -810.7664794921875, "loss": 0.3821, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.1721351146698, "rewards/margins": 2.4154422283172607, "rewards/rejected": -5.587576866149902, "step": 6510 }, { "epoch": 0.43, "grad_norm": 19.125, "learning_rate": 3.5439649653517416e-06, "logits/chosen": -1.0044896602630615, "logits/rejected": -0.8298678398132324, "logps/chosen": -630.4362182617188, "logps/rejected": -821.9078979492188, "loss": 0.4512, "rewards/accuracies": 0.75, "rewards/chosen": -4.017738342285156, "rewards/margins": 2.006444215774536, "rewards/rejected": -6.0241827964782715, "step": 6520 }, { "epoch": 0.43, "grad_norm": 154.0, "learning_rate": 3.538773631213221e-06, "logits/chosen": -1.3099727630615234, "logits/rejected": -0.9936957359313965, "logps/chosen": -710.7952880859375, "logps/rejected": -934.7581176757812, "loss": 0.4626, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.285350322723389, "rewards/margins": 2.6298892498016357, "rewards/rejected": -6.915240287780762, "step": 6530 }, { "epoch": 0.43, "grad_norm": 104.5, "learning_rate": 3.5335768775346097e-06, "logits/chosen": -1.1707427501678467, "logits/rejected": -0.9075485467910767, "logps/chosen": -655.3283081054688, "logps/rejected": -953.3106689453125, "loss": 0.7083, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.04822301864624, "rewards/margins": 2.186408519744873, "rewards/rejected": -6.2346320152282715, "step": 6540 }, { "epoch": 0.43, "grad_norm": 24.25, "learning_rate": 3.5283747314286634e-06, "logits/chosen": -1.0472099781036377, "logits/rejected": -1.0442800521850586, "logps/chosen": -556.478271484375, "logps/rejected": -801.5211181640625, "loss": 0.3114, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.3794121742248535, "rewards/margins": 2.2726073265075684, "rewards/rejected": -5.6520185470581055, "step": 6550 }, { "epoch": 0.43, "grad_norm": 21.0, "learning_rate": 3.523167220036269e-06, "logits/chosen": -1.0046885013580322, "logits/rejected": -1.0009300708770752, "logps/chosen": -645.7764892578125, "logps/rejected": -814.2684936523438, "loss": 0.4825, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.634432315826416, "rewards/margins": 1.816901445388794, "rewards/rejected": -5.451333045959473, "step": 6560 }, { "epoch": 0.43, "grad_norm": 44.0, "learning_rate": 3.5179543705263097e-06, "logits/chosen": -1.4909327030181885, "logits/rejected": -1.301231026649475, "logps/chosen": -697.2801513671875, "logps/rejected": -772.10791015625, "loss": 0.7437, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.118506908416748, "rewards/margins": 1.2873986959457397, "rewards/rejected": -5.405905723571777, "step": 6570 }, { "epoch": 0.43, "grad_norm": 8.9375, "learning_rate": 3.512736210095514e-06, "logits/chosen": -1.1708476543426514, "logits/rejected": -1.489991307258606, "logps/chosen": -639.1509399414062, "logps/rejected": -889.8771362304688, "loss": 0.6456, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -3.7295620441436768, "rewards/margins": 1.9820263385772705, "rewards/rejected": -5.711588382720947, "step": 6580 }, { "epoch": 0.43, "grad_norm": 50.5, "learning_rate": 3.5075127659683216e-06, "logits/chosen": -1.3868228197097778, "logits/rejected": -0.8212188482284546, "logps/chosen": -603.152587890625, "logps/rejected": -779.3824462890625, "loss": 0.5606, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.47424054145813, "rewards/margins": 1.6141815185546875, "rewards/rejected": -5.0884222984313965, "step": 6590 }, { "epoch": 0.43, "grad_norm": 5.625, "learning_rate": 3.5022840653967392e-06, "logits/chosen": -1.5098793506622314, "logits/rejected": -1.0602381229400635, "logps/chosen": -584.7099609375, "logps/rejected": -695.9259643554688, "loss": 0.5852, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.113436222076416, "rewards/margins": 1.4533684253692627, "rewards/rejected": -4.566803932189941, "step": 6600 }, { "epoch": 0.43, "eval_logits/chosen": -1.4796687364578247, "eval_logits/rejected": -1.180670142173767, "eval_logps/chosen": -585.2349853515625, "eval_logps/rejected": -722.522705078125, "eval_loss": 0.5490060448646545, "eval_rewards/accuracies": 0.7289999723434448, "eval_rewards/chosen": -3.206153631210327, "eval_rewards/margins": 1.5733569860458374, "eval_rewards/rejected": -4.779510974884033, "eval_runtime": 1082.2183, "eval_samples_per_second": 1.848, "eval_steps_per_second": 1.848, "step": 6600 }, { "epoch": 0.43, "grad_norm": 64.5, "learning_rate": 3.497050135660196e-06, "logits/chosen": -1.1259987354278564, "logits/rejected": -1.2642868757247925, "logps/chosen": -534.7787475585938, "logps/rejected": -719.9671630859375, "loss": 0.634, "rewards/accuracies": 0.75, "rewards/chosen": -3.1369824409484863, "rewards/margins": 1.3677839040756226, "rewards/rejected": -4.50476598739624, "step": 6610 }, { "epoch": 0.43, "grad_norm": 34.75, "learning_rate": 3.4918110040654035e-06, "logits/chosen": -1.3131985664367676, "logits/rejected": -1.0110490322113037, "logps/chosen": -523.1505737304688, "logps/rejected": -689.4434204101562, "loss": 0.6517, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.0824968814849854, "rewards/margins": 1.3532116413116455, "rewards/rejected": -4.435708522796631, "step": 6620 }, { "epoch": 0.43, "grad_norm": 54.0, "learning_rate": 3.4865666979462133e-06, "logits/chosen": -1.265448808670044, "logits/rejected": -1.0662140846252441, "logps/chosen": -616.4017333984375, "logps/rejected": -733.4095458984375, "loss": 0.6586, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.8197052478790283, "rewards/margins": 1.1563078165054321, "rewards/rejected": -4.976012229919434, "step": 6630 }, { "epoch": 0.43, "grad_norm": 36.25, "learning_rate": 3.481317244663472e-06, "logits/chosen": -1.1698582172393799, "logits/rejected": -0.9577374458312988, "logps/chosen": -529.6564331054688, "logps/rejected": -763.5572509765625, "loss": 0.4136, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.349369525909424, "rewards/margins": 2.193840503692627, "rewards/rejected": -5.543210029602051, "step": 6640 }, { "epoch": 0.44, "grad_norm": 26.0, "learning_rate": 3.476062671604882e-06, "logits/chosen": -1.5927263498306274, "logits/rejected": -1.235026478767395, "logps/chosen": -702.8513793945312, "logps/rejected": -829.373046875, "loss": 0.471, "rewards/accuracies": 0.75, "rewards/chosen": -3.929624080657959, "rewards/margins": 1.867884874343872, "rewards/rejected": -5.797508716583252, "step": 6650 }, { "epoch": 0.44, "grad_norm": 31.375, "learning_rate": 3.470803006184858e-06, "logits/chosen": -0.9971973299980164, "logits/rejected": -0.9346476793289185, "logps/chosen": -790.8199462890625, "logps/rejected": -936.06494140625, "loss": 0.8579, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.757009983062744, "rewards/margins": 1.7614641189575195, "rewards/rejected": -6.5184736251831055, "step": 6660 }, { "epoch": 0.44, "grad_norm": 61.5, "learning_rate": 3.4655382758443793e-06, "logits/chosen": -1.488431692123413, "logits/rejected": -0.8016947507858276, "logps/chosen": -658.55029296875, "logps/rejected": -701.2600708007812, "loss": 0.8889, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.891911745071411, "rewards/margins": 0.9751306772232056, "rewards/rejected": -4.867042064666748, "step": 6670 }, { "epoch": 0.44, "grad_norm": 12.9375, "learning_rate": 3.4602685080508523e-06, "logits/chosen": -1.017852544784546, "logits/rejected": -1.1269638538360596, "logps/chosen": -690.8931884765625, "logps/rejected": -848.4417724609375, "loss": 0.4095, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.140925407409668, "rewards/margins": 1.7078224420547485, "rewards/rejected": -5.848748207092285, "step": 6680 }, { "epoch": 0.44, "grad_norm": 8.9375, "learning_rate": 3.4549937302979653e-06, "logits/chosen": -1.5259836912155151, "logits/rejected": -0.9117225408554077, "logps/chosen": -657.3452758789062, "logps/rejected": -743.4940185546875, "loss": 0.5068, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.5983433723449707, "rewards/margins": 1.3942855596542358, "rewards/rejected": -4.992629051208496, "step": 6690 }, { "epoch": 0.44, "grad_norm": 20.0, "learning_rate": 3.4497139701055427e-06, "logits/chosen": -0.9411404728889465, "logits/rejected": -1.140500783920288, "logps/chosen": -539.5113525390625, "logps/rejected": -770.2239379882812, "loss": 0.43, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.134148120880127, "rewards/margins": 2.20993971824646, "rewards/rejected": -5.344087600708008, "step": 6700 }, { "epoch": 0.44, "eval_logits/chosen": -1.3082882165908813, "eval_logits/rejected": -1.016937017440796, "eval_logps/chosen": -667.5021362304688, "eval_logps/rejected": -837.1741943359375, "eval_loss": 0.5775753259658813, "eval_rewards/accuracies": 0.7294999957084656, "eval_rewards/chosen": -4.028825283050537, "eval_rewards/margins": 1.8972002267837524, "eval_rewards/rejected": -5.926025390625, "eval_runtime": 1082.1839, "eval_samples_per_second": 1.848, "eval_steps_per_second": 1.848, "step": 6700 }, { "epoch": 0.44, "grad_norm": 55.75, "learning_rate": 3.4444292550194063e-06, "logits/chosen": -0.9609133005142212, "logits/rejected": -1.1071789264678955, "logps/chosen": -610.5416259765625, "logps/rejected": -817.0380249023438, "loss": 0.5704, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.36357307434082, "rewards/margins": 1.4153515100479126, "rewards/rejected": -5.778923988342285, "step": 6710 }, { "epoch": 0.44, "grad_norm": 14.25, "learning_rate": 3.4391396126112282e-06, "logits/chosen": -1.5643399953842163, "logits/rejected": -1.0902221202850342, "logps/chosen": -740.6421508789062, "logps/rejected": -872.9747924804688, "loss": 0.4338, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.725341320037842, "rewards/margins": 1.9872252941131592, "rewards/rejected": -5.712567329406738, "step": 6720 }, { "epoch": 0.44, "grad_norm": 28.25, "learning_rate": 3.4338450704783854e-06, "logits/chosen": -1.4094915390014648, "logits/rejected": -0.9264295697212219, "logps/chosen": -638.4910888671875, "logps/rejected": -791.9893798828125, "loss": 0.4237, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.636519193649292, "rewards/margins": 1.9294960498809814, "rewards/rejected": -5.566015243530273, "step": 6730 }, { "epoch": 0.44, "grad_norm": 5.4375, "learning_rate": 3.428545656243821e-06, "logits/chosen": -1.4872112274169922, "logits/rejected": -1.38477623462677, "logps/chosen": -581.27978515625, "logps/rejected": -769.24169921875, "loss": 0.4277, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.162022352218628, "rewards/margins": 2.3497231006622314, "rewards/rejected": -5.511745929718018, "step": 6740 }, { "epoch": 0.44, "grad_norm": 6.84375, "learning_rate": 3.423241397555893e-06, "logits/chosen": -1.1169841289520264, "logits/rejected": -1.2961478233337402, "logps/chosen": -573.9603271484375, "logps/rejected": -816.7371215820312, "loss": 0.4746, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.6173348426818848, "rewards/margins": 1.950811743736267, "rewards/rejected": -5.568145751953125, "step": 6750 }, { "epoch": 0.44, "grad_norm": 64.5, "learning_rate": 3.4179323220882397e-06, "logits/chosen": -1.3705536127090454, "logits/rejected": -0.9015995860099792, "logps/chosen": -529.6578369140625, "logps/rejected": -793.2229614257812, "loss": 0.3038, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.2998898029327393, "rewards/margins": 2.602259397506714, "rewards/rejected": -5.902149200439453, "step": 6760 }, { "epoch": 0.44, "grad_norm": 33.5, "learning_rate": 3.4126184575396253e-06, "logits/chosen": -1.4175446033477783, "logits/rejected": -1.4479984045028687, "logps/chosen": -642.1771240234375, "logps/rejected": -739.0440673828125, "loss": 0.6868, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.7927658557891846, "rewards/margins": 1.2115180492401123, "rewards/rejected": -5.004284858703613, "step": 6770 }, { "epoch": 0.44, "grad_norm": 15.8125, "learning_rate": 3.407299831633802e-06, "logits/chosen": -1.7377325296401978, "logits/rejected": -1.3712141513824463, "logps/chosen": -664.529541015625, "logps/rejected": -780.9830932617188, "loss": 0.5562, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.5056424140930176, "rewards/margins": 1.4847978353500366, "rewards/rejected": -4.9904398918151855, "step": 6780 }, { "epoch": 0.44, "grad_norm": 108.5, "learning_rate": 3.401976472119361e-06, "logits/chosen": -1.4374101161956787, "logits/rejected": -0.9489856958389282, "logps/chosen": -662.2592163085938, "logps/rejected": -815.4446411132812, "loss": 0.6132, "rewards/accuracies": 0.75, "rewards/chosen": -3.831390857696533, "rewards/margins": 1.835818886756897, "rewards/rejected": -5.667209148406982, "step": 6790 }, { "epoch": 0.44, "grad_norm": 17.625, "learning_rate": 3.396648406769593e-06, "logits/chosen": -1.1313990354537964, "logits/rejected": -0.8804634213447571, "logps/chosen": -591.3135986328125, "logps/rejected": -772.510009765625, "loss": 0.4531, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.5680556297302246, "rewards/margins": 1.6709951162338257, "rewards/rejected": -5.23905086517334, "step": 6800 }, { "epoch": 0.44, "eval_logits/chosen": -1.504401683807373, "eval_logits/rejected": -1.2265653610229492, "eval_logps/chosen": -607.2781372070312, "eval_logps/rejected": -758.2288818359375, "eval_loss": 0.5667298436164856, "eval_rewards/accuracies": 0.7384999990463257, "eval_rewards/chosen": -3.4265854358673096, "eval_rewards/margins": 1.7099865674972534, "eval_rewards/rejected": -5.136571407318115, "eval_runtime": 1081.8459, "eval_samples_per_second": 1.849, "eval_steps_per_second": 1.849, "step": 6800 }, { "epoch": 0.45, "grad_norm": 19.0, "learning_rate": 3.3913156633823365e-06, "logits/chosen": -1.5500277280807495, "logits/rejected": -1.472732424736023, "logps/chosen": -591.5975341796875, "logps/rejected": -721.2940673828125, "loss": 0.5506, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.153607130050659, "rewards/margins": 1.3623026609420776, "rewards/rejected": -4.5159101486206055, "step": 6810 }, { "epoch": 0.45, "grad_norm": 36.25, "learning_rate": 3.38597826977984e-06, "logits/chosen": -1.354288935661316, "logits/rejected": -0.9966108202934265, "logps/chosen": -626.7858276367188, "logps/rejected": -755.902587890625, "loss": 0.5298, "rewards/accuracies": 0.75, "rewards/chosen": -3.75870943069458, "rewards/margins": 1.3729784488677979, "rewards/rejected": -5.131688117980957, "step": 6820 }, { "epoch": 0.45, "grad_norm": 26.125, "learning_rate": 3.380636253808612e-06, "logits/chosen": -1.2391928434371948, "logits/rejected": -1.2273128032684326, "logps/chosen": -644.2972412109375, "logps/rejected": -927.6041259765625, "loss": 0.308, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.9292054176330566, "rewards/margins": 2.3957486152648926, "rewards/rejected": -6.324954032897949, "step": 6830 }, { "epoch": 0.45, "grad_norm": 43.0, "learning_rate": 3.375289643339277e-06, "logits/chosen": -1.177965760231018, "logits/rejected": -0.6665647029876709, "logps/chosen": -672.32080078125, "logps/rejected": -906.7097778320312, "loss": 0.461, "rewards/accuracies": 0.75, "rewards/chosen": -4.4873247146606445, "rewards/margins": 2.0724503993988037, "rewards/rejected": -6.559775352478027, "step": 6840 }, { "epoch": 0.45, "grad_norm": 90.0, "learning_rate": 3.369938466266428e-06, "logits/chosen": -1.594396948814392, "logits/rejected": -1.5015779733657837, "logps/chosen": -644.0408935546875, "logps/rejected": -809.135986328125, "loss": 0.5651, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.7662017345428467, "rewards/margins": 1.526520013809204, "rewards/rejected": -5.292721748352051, "step": 6850 }, { "epoch": 0.45, "grad_norm": 43.5, "learning_rate": 3.3645827505084877e-06, "logits/chosen": -1.6889541149139404, "logits/rejected": -0.7493103742599487, "logps/chosen": -667.343505859375, "logps/rejected": -709.4847412109375, "loss": 0.5319, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.6486458778381348, "rewards/margins": 1.2506225109100342, "rewards/rejected": -4.89926815032959, "step": 6860 }, { "epoch": 0.45, "grad_norm": 53.5, "learning_rate": 3.359222524007555e-06, "logits/chosen": -1.7579923868179321, "logits/rejected": -1.434496521949768, "logps/chosen": -583.5917358398438, "logps/rejected": -792.00146484375, "loss": 0.5626, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.3357136249542236, "rewards/margins": 1.763891577720642, "rewards/rejected": -5.099604606628418, "step": 6870 }, { "epoch": 0.45, "grad_norm": 31.625, "learning_rate": 3.353857814729262e-06, "logits/chosen": -1.5431883335113525, "logits/rejected": -1.2958720922470093, "logps/chosen": -563.8482666015625, "logps/rejected": -652.4342041015625, "loss": 0.5778, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.098480701446533, "rewards/margins": 1.3689591884613037, "rewards/rejected": -4.467439651489258, "step": 6880 }, { "epoch": 0.45, "grad_norm": 7.1875, "learning_rate": 3.3484886506626312e-06, "logits/chosen": -1.4905251264572144, "logits/rejected": -1.4205820560455322, "logps/chosen": -523.4661254882812, "logps/rejected": -767.809326171875, "loss": 0.3002, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.6606967449188232, "rewards/margins": 2.4414374828338623, "rewards/rejected": -5.1021342277526855, "step": 6890 }, { "epoch": 0.45, "grad_norm": 7.8125, "learning_rate": 3.343115059819925e-06, "logits/chosen": -1.7407093048095703, "logits/rejected": -1.4658243656158447, "logps/chosen": -550.0237426757812, "logps/rejected": -661.7894287109375, "loss": 0.4527, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.534741163253784, "rewards/margins": 1.6986202001571655, "rewards/rejected": -4.23336124420166, "step": 6900 }, { "epoch": 0.45, "eval_logits/chosen": -1.6318613290786743, "eval_logits/rejected": -1.3551714420318604, "eval_logps/chosen": -575.7308959960938, "eval_logps/rejected": -717.8849487304688, "eval_loss": 0.5577818751335144, "eval_rewards/accuracies": 0.7275000214576721, "eval_rewards/chosen": -3.111112356185913, "eval_rewards/margins": 1.6220200061798096, "eval_rewards/rejected": -4.733132362365723, "eval_runtime": 1084.62, "eval_samples_per_second": 1.844, "eval_steps_per_second": 1.844, "step": 6900 }, { "epoch": 0.45, "grad_norm": 13.25, "learning_rate": 3.3377370702365007e-06, "logits/chosen": -1.885558843612671, "logits/rejected": -1.4922212362289429, "logps/chosen": -651.3240356445312, "logps/rejected": -730.9512939453125, "loss": 0.6088, "rewards/accuracies": 0.625, "rewards/chosen": -3.504784345626831, "rewards/margins": 1.1132903099060059, "rewards/rejected": -4.618074893951416, "step": 6910 }, { "epoch": 0.45, "grad_norm": 7.1875, "learning_rate": 3.332354709970667e-06, "logits/chosen": -1.4840366840362549, "logits/rejected": -1.5410720109939575, "logps/chosen": -554.0747680664062, "logps/rejected": -743.5648193359375, "loss": 0.5585, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.1169445514678955, "rewards/margins": 1.6109302043914795, "rewards/rejected": -4.727874279022217, "step": 6920 }, { "epoch": 0.45, "grad_norm": 16.5, "learning_rate": 3.3269680071035347e-06, "logits/chosen": -1.4355019330978394, "logits/rejected": -1.1574844121932983, "logps/chosen": -619.2803955078125, "logps/rejected": -750.0830078125, "loss": 0.5545, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.173475742340088, "rewards/margins": 1.8135722875595093, "rewards/rejected": -4.9870476722717285, "step": 6930 }, { "epoch": 0.45, "grad_norm": 13.5625, "learning_rate": 3.3215769897388706e-06, "logits/chosen": -1.953036904335022, "logits/rejected": -1.3070110082626343, "logps/chosen": -622.0767822265625, "logps/rejected": -782.9337768554688, "loss": 0.3824, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.19659423828125, "rewards/margins": 2.1932530403137207, "rewards/rejected": -5.389847278594971, "step": 6940 }, { "epoch": 0.45, "grad_norm": 22.125, "learning_rate": 3.316181686002951e-06, "logits/chosen": -1.4782679080963135, "logits/rejected": -1.2861192226409912, "logps/chosen": -656.7242431640625, "logps/rejected": -835.3561401367188, "loss": 0.4215, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.289233684539795, "rewards/margins": 2.035557270050049, "rewards/rejected": -5.324791431427002, "step": 6950 }, { "epoch": 0.46, "grad_norm": 17.375, "learning_rate": 3.310782124044416e-06, "logits/chosen": -1.7036292552947998, "logits/rejected": -1.4331004619598389, "logps/chosen": -611.147216796875, "logps/rejected": -693.2813720703125, "loss": 0.6484, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.492359161376953, "rewards/margins": 1.225664496421814, "rewards/rejected": -4.718024253845215, "step": 6960 }, { "epoch": 0.46, "grad_norm": 14.875, "learning_rate": 3.3053783320341226e-06, "logits/chosen": -1.8087949752807617, "logits/rejected": -1.5012948513031006, "logps/chosen": -564.1429443359375, "logps/rejected": -799.9796142578125, "loss": 0.421, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.8972315788269043, "rewards/margins": 2.337704658508301, "rewards/rejected": -5.234936237335205, "step": 6970 }, { "epoch": 0.46, "grad_norm": 8.0625, "learning_rate": 3.299970338164995e-06, "logits/chosen": -1.683707594871521, "logits/rejected": -1.4468406438827515, "logps/chosen": -579.1633911132812, "logps/rejected": -695.1063842773438, "loss": 0.5756, "rewards/accuracies": 0.75, "rewards/chosen": -2.9984469413757324, "rewards/margins": 1.4543402194976807, "rewards/rejected": -4.452786922454834, "step": 6980 }, { "epoch": 0.46, "grad_norm": 17.5, "learning_rate": 3.2945581706518815e-06, "logits/chosen": -1.534862756729126, "logits/rejected": -1.247426152229309, "logps/chosen": -568.373779296875, "logps/rejected": -719.7665405273438, "loss": 0.4711, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.1793017387390137, "rewards/margins": 1.5431853532791138, "rewards/rejected": -4.722487449645996, "step": 6990 }, { "epoch": 0.46, "grad_norm": 24.875, "learning_rate": 3.2891418577314037e-06, "logits/chosen": -1.5055022239685059, "logits/rejected": -1.2874605655670166, "logps/chosen": -581.0638427734375, "logps/rejected": -703.4862670898438, "loss": 0.5708, "rewards/accuracies": 0.75, "rewards/chosen": -3.4877593517303467, "rewards/margins": 1.5206756591796875, "rewards/rejected": -5.008435249328613, "step": 7000 }, { "epoch": 0.46, "eval_logits/chosen": -1.6089717149734497, "eval_logits/rejected": -1.3404697179794312, "eval_logps/chosen": -587.5587158203125, "eval_logps/rejected": -724.8992919921875, "eval_loss": 0.5355511903762817, "eval_rewards/accuracies": 0.7354999780654907, "eval_rewards/chosen": -3.22939133644104, "eval_rewards/margins": 1.5738849639892578, "eval_rewards/rejected": -4.803276538848877, "eval_runtime": 1082.8945, "eval_samples_per_second": 1.847, "eval_steps_per_second": 1.847, "step": 7000 }, { "epoch": 0.46, "grad_norm": 26.5, "learning_rate": 3.283721427661813e-06, "logits/chosen": -1.0820029973983765, "logits/rejected": -1.039508581161499, "logps/chosen": -574.6074829101562, "logps/rejected": -809.5228881835938, "loss": 0.4869, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.05010986328125, "rewards/margins": 2.056291103363037, "rewards/rejected": -5.106400489807129, "step": 7010 }, { "epoch": 0.46, "grad_norm": 50.75, "learning_rate": 3.2782969087228383e-06, "logits/chosen": -1.5877466201782227, "logits/rejected": -1.4500097036361694, "logps/chosen": -638.3268432617188, "logps/rejected": -757.1890869140625, "loss": 0.47, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.307526111602783, "rewards/margins": 1.9393641948699951, "rewards/rejected": -5.246890068054199, "step": 7020 }, { "epoch": 0.46, "grad_norm": 14.6875, "learning_rate": 3.2728683292155426e-06, "logits/chosen": -1.6411584615707397, "logits/rejected": -0.8621476888656616, "logps/chosen": -691.7325439453125, "logps/rejected": -808.1011962890625, "loss": 0.4499, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.78266978263855, "rewards/margins": 1.8511171340942383, "rewards/rejected": -5.633786678314209, "step": 7030 }, { "epoch": 0.46, "grad_norm": 32.0, "learning_rate": 3.267435717462175e-06, "logits/chosen": -1.7918535470962524, "logits/rejected": -1.1310433149337769, "logps/chosen": -612.1990966796875, "logps/rejected": -756.9449462890625, "loss": 0.4861, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.502836227416992, "rewards/margins": 1.906011939048767, "rewards/rejected": -5.408847808837891, "step": 7040 }, { "epoch": 0.46, "grad_norm": 38.0, "learning_rate": 3.2619991018060195e-06, "logits/chosen": -1.2215138673782349, "logits/rejected": -1.1000020503997803, "logps/chosen": -694.2501831054688, "logps/rejected": -824.8701171875, "loss": 0.6547, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.0984110832214355, "rewards/margins": 1.2744941711425781, "rewards/rejected": -5.372905254364014, "step": 7050 }, { "epoch": 0.46, "grad_norm": 59.75, "learning_rate": 3.2565585106112507e-06, "logits/chosen": -1.452492117881775, "logits/rejected": -1.0878585577011108, "logps/chosen": -656.5543212890625, "logps/rejected": -750.7845458984375, "loss": 0.7215, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.011723518371582, "rewards/margins": 1.2090781927108765, "rewards/rejected": -5.22080135345459, "step": 7060 }, { "epoch": 0.46, "grad_norm": 42.75, "learning_rate": 3.2511139722627843e-06, "logits/chosen": -1.3334091901779175, "logits/rejected": -1.4556140899658203, "logps/chosen": -667.8685302734375, "logps/rejected": -755.2457275390625, "loss": 0.5626, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -3.635791778564453, "rewards/margins": 1.4075740575790405, "rewards/rejected": -5.043365478515625, "step": 7070 }, { "epoch": 0.46, "grad_norm": 7.96875, "learning_rate": 3.24566551516613e-06, "logits/chosen": -1.5350252389907837, "logits/rejected": -0.7005395293235779, "logps/chosen": -644.5778198242188, "logps/rejected": -764.1461181640625, "loss": 0.4907, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.6552436351776123, "rewards/margins": 2.117025852203369, "rewards/rejected": -5.772270202636719, "step": 7080 }, { "epoch": 0.46, "grad_norm": 50.0, "learning_rate": 3.2402131677472414e-06, "logits/chosen": -1.477556586265564, "logits/rejected": -1.0628242492675781, "logps/chosen": -672.3294067382812, "logps/rejected": -763.3955078125, "loss": 0.5036, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.9903132915496826, "rewards/margins": 1.2019013166427612, "rewards/rejected": -5.1922149658203125, "step": 7090 }, { "epoch": 0.46, "grad_norm": 6.21875, "learning_rate": 3.23475695845237e-06, "logits/chosen": -1.5530791282653809, "logits/rejected": -1.2321417331695557, "logps/chosen": -658.9381713867188, "logps/rejected": -699.3362426757812, "loss": 0.6367, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.7786331176757812, "rewards/margins": 1.0995142459869385, "rewards/rejected": -4.878147125244141, "step": 7100 }, { "epoch": 0.46, "eval_logits/chosen": -1.54840886592865, "eval_logits/rejected": -1.2864742279052734, "eval_logps/chosen": -630.9788818359375, "eval_logps/rejected": -765.6870727539062, "eval_loss": 0.5203990340232849, "eval_rewards/accuracies": 0.7390000224113464, "eval_rewards/chosen": -3.66359281539917, "eval_rewards/margins": 1.5475611686706543, "eval_rewards/rejected": -5.211153984069824, "eval_runtime": 1082.16, "eval_samples_per_second": 1.848, "eval_steps_per_second": 1.848, "step": 7100 }, { "epoch": 0.47, "grad_norm": 46.25, "learning_rate": 3.2292969157479153e-06, "logits/chosen": -1.463730812072754, "logits/rejected": -1.384150743484497, "logps/chosen": -550.3345947265625, "logps/rejected": -651.9660034179688, "loss": 0.4044, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.2190284729003906, "rewards/margins": 1.6009973287582397, "rewards/rejected": -4.820025444030762, "step": 7110 }, { "epoch": 0.47, "grad_norm": 64.0, "learning_rate": 3.223833068120276e-06, "logits/chosen": -1.5266828536987305, "logits/rejected": -1.0495185852050781, "logps/chosen": -646.2396240234375, "logps/rejected": -787.3951416015625, "loss": 0.5022, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.0756330490112305, "rewards/margins": 1.8406120538711548, "rewards/rejected": -5.916245460510254, "step": 7120 }, { "epoch": 0.47, "grad_norm": 9.1875, "learning_rate": 3.2183654440757023e-06, "logits/chosen": -1.4513148069381714, "logits/rejected": -0.9978975057601929, "logps/chosen": -718.1594848632812, "logps/rejected": -881.6779174804688, "loss": 0.4475, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.303107261657715, "rewards/margins": 1.9463388919830322, "rewards/rejected": -6.249446392059326, "step": 7130 }, { "epoch": 0.47, "grad_norm": 11.0, "learning_rate": 3.2128940721401474e-06, "logits/chosen": -1.3465971946716309, "logits/rejected": -1.172060251235962, "logps/chosen": -744.1934204101562, "logps/rejected": -880.4222412109375, "loss": 0.4812, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.622055530548096, "rewards/margins": 1.640036940574646, "rewards/rejected": -6.262092113494873, "step": 7140 }, { "epoch": 0.47, "grad_norm": 11.5625, "learning_rate": 3.2074189808591182e-06, "logits/chosen": -1.0147311687469482, "logits/rejected": -0.7057236433029175, "logps/chosen": -652.673095703125, "logps/rejected": -807.462158203125, "loss": 0.3714, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.453369140625, "rewards/margins": 1.642346739768982, "rewards/rejected": -6.095715522766113, "step": 7150 }, { "epoch": 0.47, "grad_norm": 7.71875, "learning_rate": 3.201940198797526e-06, "logits/chosen": -1.3710700273513794, "logits/rejected": -1.1725952625274658, "logps/chosen": -717.8284912109375, "logps/rejected": -904.71044921875, "loss": 0.5299, "rewards/accuracies": 0.75, "rewards/chosen": -4.878032684326172, "rewards/margins": 1.7449219226837158, "rewards/rejected": -6.622954368591309, "step": 7160 }, { "epoch": 0.47, "grad_norm": 29.0, "learning_rate": 3.196457754539538e-06, "logits/chosen": -1.460298776626587, "logits/rejected": -0.585815966129303, "logps/chosen": -650.5462036132812, "logps/rejected": -851.5245971679688, "loss": 0.417, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.9128952026367188, "rewards/margins": 2.3818061351776123, "rewards/rejected": -6.294702053070068, "step": 7170 }, { "epoch": 0.47, "grad_norm": 16.875, "learning_rate": 3.190971676688427e-06, "logits/chosen": -1.3092660903930664, "logits/rejected": -1.3981274366378784, "logps/chosen": -669.8778076171875, "logps/rejected": -878.0070190429688, "loss": 0.3421, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.033218860626221, "rewards/margins": 1.9290132522583008, "rewards/rejected": -5.9622321128845215, "step": 7180 }, { "epoch": 0.47, "grad_norm": 18.75, "learning_rate": 3.1854819938664245e-06, "logits/chosen": -1.2578312158584595, "logits/rejected": -1.2489131689071655, "logps/chosen": -646.1082153320312, "logps/rejected": -814.2697143554688, "loss": 0.5089, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.804719924926758, "rewards/margins": 1.7093582153320312, "rewards/rejected": -5.514077663421631, "step": 7190 }, { "epoch": 0.47, "grad_norm": 17.125, "learning_rate": 3.17998873471457e-06, "logits/chosen": -1.3048036098480225, "logits/rejected": -1.2573864459991455, "logps/chosen": -711.3955688476562, "logps/rejected": -957.9508666992188, "loss": 0.7849, "rewards/accuracies": 0.625, "rewards/chosen": -4.599392890930176, "rewards/margins": 1.404470443725586, "rewards/rejected": -6.0038628578186035, "step": 7200 }, { "epoch": 0.47, "eval_logits/chosen": -1.4047660827636719, "eval_logits/rejected": -1.117523431777954, "eval_logps/chosen": -667.6451416015625, "eval_logps/rejected": -811.4156494140625, "eval_loss": 0.5287602543830872, "eval_rewards/accuracies": 0.7379999756813049, "eval_rewards/chosen": -4.030255317687988, "eval_rewards/margins": 1.63818359375, "eval_rewards/rejected": -5.6684393882751465, "eval_runtime": 1082.9832, "eval_samples_per_second": 1.847, "eval_steps_per_second": 1.847, "step": 7200 }, { "epoch": 0.47, "grad_norm": 99.0, "learning_rate": 3.174491927892561e-06, "logits/chosen": -1.3338240385055542, "logits/rejected": -1.1682806015014648, "logps/chosen": -595.0701904296875, "logps/rejected": -810.501708984375, "loss": 0.3612, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.484051465988159, "rewards/margins": 2.3265881538391113, "rewards/rejected": -5.810639381408691, "step": 7210 }, { "epoch": 0.47, "grad_norm": 24.25, "learning_rate": 3.1689916020786037e-06, "logits/chosen": -1.454010248184204, "logits/rejected": -0.8474670648574829, "logps/chosen": -667.0938720703125, "logps/rejected": -754.9188232421875, "loss": 0.577, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.159677505493164, "rewards/margins": 1.2116779088974, "rewards/rejected": -5.3713555335998535, "step": 7220 }, { "epoch": 0.47, "grad_norm": 50.25, "learning_rate": 3.1634877859692652e-06, "logits/chosen": -1.5685755014419556, "logits/rejected": -1.0399705171585083, "logps/chosen": -598.0698852539062, "logps/rejected": -801.1921997070312, "loss": 0.5213, "rewards/accuracies": 0.75, "rewards/chosen": -3.814943790435791, "rewards/margins": 1.74563467502594, "rewards/rejected": -5.560577869415283, "step": 7230 }, { "epoch": 0.47, "grad_norm": 31.25, "learning_rate": 3.1579805082793202e-06, "logits/chosen": -1.2529449462890625, "logits/rejected": -1.2278773784637451, "logps/chosen": -663.9173583984375, "logps/rejected": -786.9188842773438, "loss": 0.6456, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.284888744354248, "rewards/margins": 1.3399271965026855, "rewards/rejected": -5.624815940856934, "step": 7240 }, { "epoch": 0.47, "grad_norm": 9.75, "learning_rate": 3.152469797741605e-06, "logits/chosen": -1.1691783666610718, "logits/rejected": -1.1624130010604858, "logps/chosen": -675.5157470703125, "logps/rejected": -851.5408935546875, "loss": 0.5285, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.925585985183716, "rewards/margins": 1.9795547723770142, "rewards/rejected": -5.9051408767700195, "step": 7250 }, { "epoch": 0.48, "grad_norm": 88.0, "learning_rate": 3.146955683106867e-06, "logits/chosen": -1.2714333534240723, "logits/rejected": -1.08028244972229, "logps/chosen": -632.1842041015625, "logps/rejected": -867.6624145507812, "loss": 0.4951, "rewards/accuracies": 0.75, "rewards/chosen": -3.733079433441162, "rewards/margins": 2.303467273712158, "rewards/rejected": -6.0365471839904785, "step": 7260 }, { "epoch": 0.48, "grad_norm": 16.375, "learning_rate": 3.1414381931436094e-06, "logits/chosen": -1.3526496887207031, "logits/rejected": -1.1084474325180054, "logps/chosen": -590.5748291015625, "logps/rejected": -680.8483276367188, "loss": 0.4986, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.2272675037384033, "rewards/margins": 1.4602758884429932, "rewards/rejected": -4.687542915344238, "step": 7270 }, { "epoch": 0.48, "grad_norm": 24.0, "learning_rate": 3.1359173566379503e-06, "logits/chosen": -1.6744945049285889, "logits/rejected": -1.212215781211853, "logps/chosen": -666.6182861328125, "logps/rejected": -748.7386474609375, "loss": 0.5171, "rewards/accuracies": 0.75, "rewards/chosen": -3.730123519897461, "rewards/margins": 1.3492710590362549, "rewards/rejected": -5.079394340515137, "step": 7280 }, { "epoch": 0.48, "grad_norm": 4.375, "learning_rate": 3.130393202393464e-06, "logits/chosen": -1.1931312084197998, "logits/rejected": -1.2022308111190796, "logps/chosen": -590.35107421875, "logps/rejected": -792.2325439453125, "loss": 0.4547, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.158155679702759, "rewards/margins": 2.2016148567199707, "rewards/rejected": -5.359770774841309, "step": 7290 }, { "epoch": 0.48, "grad_norm": 10.0625, "learning_rate": 3.124865759231035e-06, "logits/chosen": -1.4116827249526978, "logits/rejected": -1.044814109802246, "logps/chosen": -642.3245849609375, "logps/rejected": -793.8795776367188, "loss": 0.3462, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.5619442462921143, "rewards/margins": 1.8919264078140259, "rewards/rejected": -5.45387077331543, "step": 7300 }, { "epoch": 0.48, "eval_logits/chosen": -1.3266592025756836, "eval_logits/rejected": -1.0407497882843018, "eval_logps/chosen": -688.275634765625, "eval_logps/rejected": -840.9078979492188, "eval_loss": 0.5394913554191589, "eval_rewards/accuracies": 0.734499990940094, "eval_rewards/chosen": -4.236560344696045, "eval_rewards/margins": 1.7268023490905762, "eval_rewards/rejected": -5.963362693786621, "eval_runtime": 1082.5778, "eval_samples_per_second": 1.847, "eval_steps_per_second": 1.847, "step": 7300 }, { "epoch": 0.48, "grad_norm": 22.25, "learning_rate": 3.119335055988709e-06, "logits/chosen": -1.200289011001587, "logits/rejected": -0.7436946034431458, "logps/chosen": -706.0857543945312, "logps/rejected": -775.4239501953125, "loss": 0.7426, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.4149169921875, "rewards/margins": 1.5412516593933105, "rewards/rejected": -5.9561686515808105, "step": 7310 }, { "epoch": 0.48, "grad_norm": 25.5, "learning_rate": 3.1138011215215392e-06, "logits/chosen": -1.3326818943023682, "logits/rejected": -1.2478901147842407, "logps/chosen": -599.475830078125, "logps/rejected": -836.8077392578125, "loss": 0.3841, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.738217830657959, "rewards/margins": 2.167809247970581, "rewards/rejected": -5.906027793884277, "step": 7320 }, { "epoch": 0.48, "grad_norm": 49.25, "learning_rate": 3.108263984701434e-06, "logits/chosen": -0.934039294719696, "logits/rejected": -1.1128921508789062, "logps/chosen": -677.9228515625, "logps/rejected": -859.9694213867188, "loss": 0.4558, "rewards/accuracies": 0.75, "rewards/chosen": -4.298111438751221, "rewards/margins": 2.0508508682250977, "rewards/rejected": -6.348962306976318, "step": 7330 }, { "epoch": 0.48, "grad_norm": 27.25, "learning_rate": 3.1027236744170135e-06, "logits/chosen": -1.053406000137329, "logits/rejected": -1.043914794921875, "logps/chosen": -667.1046142578125, "logps/rejected": -942.5146484375, "loss": 0.4343, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.283786773681641, "rewards/margins": 2.4078726768493652, "rewards/rejected": -6.691660404205322, "step": 7340 }, { "epoch": 0.48, "grad_norm": 104.5, "learning_rate": 3.0971802195734524e-06, "logits/chosen": -0.934005081653595, "logits/rejected": -0.3034282326698303, "logps/chosen": -638.2991333007812, "logps/rejected": -864.8499145507812, "loss": 0.5541, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.5376715660095215, "rewards/margins": 2.1259982585906982, "rewards/rejected": -6.663669586181641, "step": 7350 }, { "epoch": 0.48, "grad_norm": 63.0, "learning_rate": 3.091633649092331e-06, "logits/chosen": -1.4932953119277954, "logits/rejected": -0.6129779815673828, "logps/chosen": -606.0397338867188, "logps/rejected": -726.544189453125, "loss": 0.5489, "rewards/accuracies": 0.75, "rewards/chosen": -3.594912052154541, "rewards/margins": 1.975142478942871, "rewards/rejected": -5.570054531097412, "step": 7360 }, { "epoch": 0.48, "grad_norm": 13.625, "learning_rate": 3.086083991911487e-06, "logits/chosen": -1.3566482067108154, "logits/rejected": -0.8617610931396484, "logps/chosen": -716.4907836914062, "logps/rejected": -830.501953125, "loss": 0.3769, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.9754276275634766, "rewards/margins": 1.741614580154419, "rewards/rejected": -5.717041969299316, "step": 7370 }, { "epoch": 0.48, "grad_norm": 16.75, "learning_rate": 3.0805312769848595e-06, "logits/chosen": -1.6154024600982666, "logits/rejected": -0.9892219305038452, "logps/chosen": -744.7152099609375, "logps/rejected": -769.2413940429688, "loss": 0.5789, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.856018543243408, "rewards/margins": 1.2163625955581665, "rewards/rejected": -5.072381496429443, "step": 7380 }, { "epoch": 0.48, "grad_norm": 21.5, "learning_rate": 3.0749755332823426e-06, "logits/chosen": -1.6563608646392822, "logits/rejected": -1.2646421194076538, "logps/chosen": -710.4370727539062, "logps/rejected": -837.8865966796875, "loss": 0.7181, "rewards/accuracies": 0.75, "rewards/chosen": -4.319083213806152, "rewards/margins": 1.2025281190872192, "rewards/rejected": -5.52161169052124, "step": 7390 }, { "epoch": 0.48, "grad_norm": 14.125, "learning_rate": 3.0694167897896304e-06, "logits/chosen": -1.1282389163970947, "logits/rejected": -1.194897174835205, "logps/chosen": -603.3027954101562, "logps/rejected": -794.1591186523438, "loss": 0.4616, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.6348769664764404, "rewards/margins": 1.858367919921875, "rewards/rejected": -5.4932451248168945, "step": 7400 }, { "epoch": 0.48, "eval_logits/chosen": -1.4319820404052734, "eval_logits/rejected": -1.1111419200897217, "eval_logps/chosen": -624.17822265625, "eval_logps/rejected": -768.3162841796875, "eval_loss": 0.5362439751625061, "eval_rewards/accuracies": 0.7354999780654907, "eval_rewards/chosen": -3.5955865383148193, "eval_rewards/margins": 1.6418592929840088, "eval_rewards/rejected": -5.237445831298828, "eval_runtime": 1082.5693, "eval_samples_per_second": 1.847, "eval_steps_per_second": 1.847, "step": 7400 }, { "epoch": 0.48, "grad_norm": 38.0, "learning_rate": 3.0638550755080675e-06, "logits/chosen": -1.7549679279327393, "logits/rejected": -1.100928783416748, "logps/chosen": -680.9194946289062, "logps/rejected": -733.08935546875, "loss": 0.7468, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -3.9301037788391113, "rewards/margins": 1.3256875276565552, "rewards/rejected": -5.255790710449219, "step": 7410 }, { "epoch": 0.49, "grad_norm": 61.0, "learning_rate": 3.0582904194545023e-06, "logits/chosen": -0.974675178527832, "logits/rejected": -0.7982308268547058, "logps/chosen": -589.9420166015625, "logps/rejected": -738.0842895507812, "loss": 0.6045, "rewards/accuracies": 0.75, "rewards/chosen": -3.7465102672576904, "rewards/margins": 1.5488064289093018, "rewards/rejected": -5.295316696166992, "step": 7420 }, { "epoch": 0.49, "grad_norm": 14.125, "learning_rate": 3.0527228506611254e-06, "logits/chosen": -1.2976282835006714, "logits/rejected": -1.314902663230896, "logps/chosen": -535.6421508789062, "logps/rejected": -740.2081298828125, "loss": 0.5299, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.790461778640747, "rewards/margins": 1.9214391708374023, "rewards/rejected": -4.711899757385254, "step": 7430 }, { "epoch": 0.49, "grad_norm": 15.3125, "learning_rate": 3.0471523981753266e-06, "logits/chosen": -1.568801999092102, "logits/rejected": -1.4353315830230713, "logps/chosen": -531.3270874023438, "logps/rejected": -643.4146118164062, "loss": 0.4047, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.762486219406128, "rewards/margins": 1.4796106815338135, "rewards/rejected": -4.242096900939941, "step": 7440 }, { "epoch": 0.49, "grad_norm": 6.0, "learning_rate": 3.0415790910595412e-06, "logits/chosen": -1.6522433757781982, "logits/rejected": -1.2307037115097046, "logps/chosen": -634.4879150390625, "logps/rejected": -861.2034301757812, "loss": 0.2648, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.5103600025177, "rewards/margins": 2.2581076622009277, "rewards/rejected": -5.768467426300049, "step": 7450 }, { "epoch": 0.49, "grad_norm": 13.5, "learning_rate": 3.0360029583910956e-06, "logits/chosen": -1.32919180393219, "logits/rejected": -1.0612585544586182, "logps/chosen": -703.5370483398438, "logps/rejected": -808.89306640625, "loss": 0.6464, "rewards/accuracies": 0.75, "rewards/chosen": -4.018516540527344, "rewards/margins": 1.3439137935638428, "rewards/rejected": -5.362430572509766, "step": 7460 }, { "epoch": 0.49, "grad_norm": 12.5, "learning_rate": 3.0304240292620598e-06, "logits/chosen": -1.5263416767120361, "logits/rejected": -1.1627568006515503, "logps/chosen": -713.2200927734375, "logps/rejected": -838.6053466796875, "loss": 0.5232, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.221181869506836, "rewards/margins": 1.7662479877471924, "rewards/rejected": -5.987429618835449, "step": 7470 }, { "epoch": 0.49, "grad_norm": 38.25, "learning_rate": 3.0248423327790938e-06, "logits/chosen": -1.2267783880233765, "logits/rejected": -1.0435937643051147, "logps/chosen": -628.8931274414062, "logps/rejected": -878.0484619140625, "loss": 0.4983, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.695591688156128, "rewards/margins": 2.278515338897705, "rewards/rejected": -5.974107265472412, "step": 7480 }, { "epoch": 0.49, "grad_norm": 2.9375, "learning_rate": 3.019257898063294e-06, "logits/chosen": -1.3420478105545044, "logits/rejected": -1.3277950286865234, "logps/chosen": -602.5700073242188, "logps/rejected": -807.8358154296875, "loss": 0.4688, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.5410866737365723, "rewards/margins": 1.8236157894134521, "rewards/rejected": -5.364701747894287, "step": 7490 }, { "epoch": 0.49, "grad_norm": 33.75, "learning_rate": 3.0136707542500438e-06, "logits/chosen": -1.6389806270599365, "logits/rejected": -1.1611101627349854, "logps/chosen": -631.3812255859375, "logps/rejected": -740.4097900390625, "loss": 0.4879, "rewards/accuracies": 0.75, "rewards/chosen": -3.860243320465088, "rewards/margins": 1.4965717792510986, "rewards/rejected": -5.356815338134766, "step": 7500 }, { "epoch": 0.49, "eval_logits/chosen": -1.4181184768676758, "eval_logits/rejected": -1.1543266773223877, "eval_logps/chosen": -660.9016723632812, "eval_logps/rejected": -803.4813842773438, "eval_loss": 0.5310589671134949, "eval_rewards/accuracies": 0.7400000095367432, "eval_rewards/chosen": -3.9628207683563232, "eval_rewards/margins": 1.6262763738632202, "eval_rewards/rejected": -5.589096546173096, "eval_runtime": 1082.6625, "eval_samples_per_second": 1.847, "eval_steps_per_second": 1.847, "step": 7500 }, { "epoch": 0.49, "grad_norm": 52.75, "learning_rate": 3.008080930488861e-06, "logits/chosen": -1.6605017185211182, "logits/rejected": -1.145690679550171, "logps/chosen": -695.2445678710938, "logps/rejected": -835.0499267578125, "loss": 0.6846, "rewards/accuracies": 0.75, "rewards/chosen": -3.8979949951171875, "rewards/margins": 1.5793064832687378, "rewards/rejected": -5.477301120758057, "step": 7510 }, { "epoch": 0.49, "grad_norm": 14.125, "learning_rate": 3.0024884559432434e-06, "logits/chosen": -1.3628549575805664, "logits/rejected": -0.7684060335159302, "logps/chosen": -657.506103515625, "logps/rejected": -802.8294677734375, "loss": 0.5036, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.987572193145752, "rewards/margins": 1.9618732929229736, "rewards/rejected": -5.9494452476501465, "step": 7520 }, { "epoch": 0.49, "grad_norm": 4.125, "learning_rate": 2.9968933597905204e-06, "logits/chosen": -1.2994136810302734, "logits/rejected": -1.2082409858703613, "logps/chosen": -653.8195190429688, "logps/rejected": -745.1570434570312, "loss": 0.7532, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.7814602851867676, "rewards/margins": 1.3759765625, "rewards/rejected": -5.157436847686768, "step": 7530 }, { "epoch": 0.49, "grad_norm": 8.5, "learning_rate": 2.9912956712217e-06, "logits/chosen": -1.7122999429702759, "logits/rejected": -1.5942037105560303, "logps/chosen": -565.8399047851562, "logps/rejected": -683.4976196289062, "loss": 0.5274, "rewards/accuracies": 0.75, "rewards/chosen": -3.212390184402466, "rewards/margins": 1.5873216390609741, "rewards/rejected": -4.799711227416992, "step": 7540 }, { "epoch": 0.49, "grad_norm": 11.0, "learning_rate": 2.9856954194413123e-06, "logits/chosen": -1.3582313060760498, "logits/rejected": -1.1383432149887085, "logps/chosen": -602.5155029296875, "logps/rejected": -803.2645874023438, "loss": 0.3242, "rewards/accuracies": 0.875, "rewards/chosen": -3.3019535541534424, "rewards/margins": 2.2614340782165527, "rewards/rejected": -5.563386917114258, "step": 7550 }, { "epoch": 0.49, "grad_norm": 25.375, "learning_rate": 2.9800926336672637e-06, "logits/chosen": -1.1484405994415283, "logits/rejected": -1.225892186164856, "logps/chosen": -647.567138671875, "logps/rejected": -784.2089233398438, "loss": 0.6556, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.9338977336883545, "rewards/margins": 1.322043538093567, "rewards/rejected": -5.255941390991211, "step": 7560 }, { "epoch": 0.5, "grad_norm": 13.1875, "learning_rate": 2.9744873431306777e-06, "logits/chosen": -1.3222450017929077, "logits/rejected": -1.05946946144104, "logps/chosen": -618.5274658203125, "logps/rejected": -768.5072021484375, "loss": 0.5559, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.7230262756347656, "rewards/margins": 1.5186145305633545, "rewards/rejected": -5.241640567779541, "step": 7570 }, { "epoch": 0.5, "grad_norm": 9.125, "learning_rate": 2.968879577075749e-06, "logits/chosen": -1.5681188106536865, "logits/rejected": -1.0411336421966553, "logps/chosen": -586.0208740234375, "logps/rejected": -753.9694213867188, "loss": 0.604, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.598011016845703, "rewards/margins": 1.7839431762695312, "rewards/rejected": -5.381953716278076, "step": 7580 }, { "epoch": 0.5, "grad_norm": 13.8125, "learning_rate": 2.9632693647595856e-06, "logits/chosen": -1.5468661785125732, "logits/rejected": -0.9271882176399231, "logps/chosen": -665.0424194335938, "logps/rejected": -826.5350341796875, "loss": 0.4356, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.079390525817871, "rewards/margins": 1.9610122442245483, "rewards/rejected": -6.040403366088867, "step": 7590 }, { "epoch": 0.5, "grad_norm": 25.75, "learning_rate": 2.9576567354520593e-06, "logits/chosen": -1.3770414590835571, "logits/rejected": -0.7890729904174805, "logps/chosen": -735.6774291992188, "logps/rejected": -827.7765502929688, "loss": 0.6047, "rewards/accuracies": 0.75, "rewards/chosen": -4.256994724273682, "rewards/margins": 1.4845167398452759, "rewards/rejected": -5.741511344909668, "step": 7600 }, { "epoch": 0.5, "eval_logits/chosen": -1.5299447774887085, "eval_logits/rejected": -1.2726281881332397, "eval_logps/chosen": -625.3944702148438, "eval_logps/rejected": -764.4761352539062, "eval_loss": 0.5196506977081299, "eval_rewards/accuracies": 0.7440000176429749, "eval_rewards/chosen": -3.6077494621276855, "eval_rewards/margins": 1.5912953615188599, "eval_rewards/rejected": -5.199044227600098, "eval_runtime": 1081.8807, "eval_samples_per_second": 1.849, "eval_steps_per_second": 1.849, "step": 7600 }, { "epoch": 0.5, "grad_norm": 10.0625, "learning_rate": 2.9520417184356504e-06, "logits/chosen": -1.6309735774993896, "logits/rejected": -0.8396612405776978, "logps/chosen": -630.0804443359375, "logps/rejected": -754.0598754882812, "loss": 0.4387, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.7775683403015137, "rewards/margins": 1.7922008037567139, "rewards/rejected": -5.569769382476807, "step": 7610 }, { "epoch": 0.5, "grad_norm": 21.625, "learning_rate": 2.9464243430052992e-06, "logits/chosen": -1.4004886150360107, "logits/rejected": -1.6711006164550781, "logps/chosen": -646.9445190429688, "logps/rejected": -757.7483520507812, "loss": 0.6243, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.7711708545684814, "rewards/margins": 1.18333101272583, "rewards/rejected": -4.954502105712891, "step": 7620 }, { "epoch": 0.5, "grad_norm": 28.375, "learning_rate": 2.940804638468247e-06, "logits/chosen": -1.663377046585083, "logits/rejected": -1.2936443090438843, "logps/chosen": -660.186767578125, "logps/rejected": -753.7015380859375, "loss": 0.6791, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -3.4665169715881348, "rewards/margins": 1.2230207920074463, "rewards/rejected": -4.689537525177002, "step": 7630 }, { "epoch": 0.5, "grad_norm": 20.5, "learning_rate": 2.9351826341438895e-06, "logits/chosen": -1.0849730968475342, "logits/rejected": -1.321204423904419, "logps/chosen": -572.1552734375, "logps/rejected": -797.3029174804688, "loss": 0.6004, "rewards/accuracies": 0.75, "rewards/chosen": -3.632864475250244, "rewards/margins": 1.6436235904693604, "rewards/rejected": -5.276488304138184, "step": 7640 }, { "epoch": 0.5, "grad_norm": 14.0625, "learning_rate": 2.92955835936362e-06, "logits/chosen": -1.2459595203399658, "logits/rejected": -1.1905423402786255, "logps/chosen": -540.1290893554688, "logps/rejected": -697.8605346679688, "loss": 0.5978, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.3062007427215576, "rewards/margins": 1.1906124353408813, "rewards/rejected": -4.4968132972717285, "step": 7650 }, { "epoch": 0.5, "grad_norm": 9.5625, "learning_rate": 2.9239318434706776e-06, "logits/chosen": -1.608249306678772, "logits/rejected": -1.3472509384155273, "logps/chosen": -591.3645629882812, "logps/rejected": -747.9082641601562, "loss": 0.4272, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.521559953689575, "rewards/margins": 1.927687644958496, "rewards/rejected": -5.449248313903809, "step": 7660 }, { "epoch": 0.5, "grad_norm": 37.0, "learning_rate": 2.918303115819992e-06, "logits/chosen": -1.0876350402832031, "logits/rejected": -0.8331940770149231, "logps/chosen": -550.4801025390625, "logps/rejected": -760.7517700195312, "loss": 0.3494, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.1947948932647705, "rewards/margins": 2.112136125564575, "rewards/rejected": -5.3069305419921875, "step": 7670 }, { "epoch": 0.5, "grad_norm": 36.25, "learning_rate": 2.9126722057780344e-06, "logits/chosen": -1.5587748289108276, "logits/rejected": -1.0833650827407837, "logps/chosen": -768.263671875, "logps/rejected": -948.2453002929688, "loss": 0.5466, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.37026309967041, "rewards/margins": 2.3177340030670166, "rewards/rejected": -6.687996864318848, "step": 7680 }, { "epoch": 0.5, "grad_norm": 13.1875, "learning_rate": 2.9070391427226624e-06, "logits/chosen": -1.3349508047103882, "logits/rejected": -1.1920472383499146, "logps/chosen": -665.2405395507812, "logps/rejected": -743.0135498046875, "loss": 0.6641, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.105926513671875, "rewards/margins": 1.0519648790359497, "rewards/rejected": -5.157891273498535, "step": 7690 }, { "epoch": 0.5, "grad_norm": 34.0, "learning_rate": 2.9014039560429634e-06, "logits/chosen": -1.7162189483642578, "logits/rejected": -1.5879261493682861, "logps/chosen": -654.9056396484375, "logps/rejected": -767.081298828125, "loss": 0.5471, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.7597122192382812, "rewards/margins": 1.4864389896392822, "rewards/rejected": -5.246151924133301, "step": 7700 }, { "epoch": 0.5, "eval_logits/chosen": -1.5228487253189087, "eval_logits/rejected": -1.277574896812439, "eval_logps/chosen": -606.4263305664062, "eval_logps/rejected": -740.7102661132812, "eval_loss": 0.5190695524215698, "eval_rewards/accuracies": 0.7379999756813049, "eval_rewards/chosen": -3.418067693710327, "eval_rewards/margins": 1.5433179140090942, "eval_rewards/rejected": -4.961385250091553, "eval_runtime": 1081.7829, "eval_samples_per_second": 1.849, "eval_steps_per_second": 1.849, "step": 7700 }, { "epoch": 0.5, "grad_norm": 22.875, "learning_rate": 2.8957666751391084e-06, "logits/chosen": -1.7355930805206299, "logits/rejected": -1.449190378189087, "logps/chosen": -535.7609252929688, "logps/rejected": -654.2330932617188, "loss": 0.6148, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -3.2646496295928955, "rewards/margins": 1.0576517581939697, "rewards/rejected": -4.322301387786865, "step": 7710 }, { "epoch": 0.51, "grad_norm": 88.5, "learning_rate": 2.89012732942219e-06, "logits/chosen": -1.852916955947876, "logits/rejected": -1.673484444618225, "logps/chosen": -589.73681640625, "logps/rejected": -715.338134765625, "loss": 0.5185, "rewards/accuracies": 0.75, "rewards/chosen": -3.0930979251861572, "rewards/margins": 1.5410335063934326, "rewards/rejected": -4.634130954742432, "step": 7720 }, { "epoch": 0.51, "grad_norm": 29.375, "learning_rate": 2.884485948314078e-06, "logits/chosen": -1.4500287771224976, "logits/rejected": -1.2408530712127686, "logps/chosen": -581.4036254882812, "logps/rejected": -654.4486083984375, "loss": 0.6957, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.4938607215881348, "rewards/margins": 0.7534639835357666, "rewards/rejected": -4.247324466705322, "step": 7730 }, { "epoch": 0.51, "grad_norm": 36.25, "learning_rate": 2.878842561247257e-06, "logits/chosen": -1.3081271648406982, "logits/rejected": -0.2527647316455841, "logps/chosen": -620.2591552734375, "logps/rejected": -792.27734375, "loss": 0.4249, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.473395824432373, "rewards/margins": 1.9879125356674194, "rewards/rejected": -5.46130895614624, "step": 7740 }, { "epoch": 0.51, "grad_norm": 11.5, "learning_rate": 2.8731971976646805e-06, "logits/chosen": -1.2632343769073486, "logits/rejected": -0.9273570775985718, "logps/chosen": -585.7144775390625, "logps/rejected": -745.6323852539062, "loss": 0.4315, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.2459537982940674, "rewards/margins": 1.8653161525726318, "rewards/rejected": -5.111269950866699, "step": 7750 }, { "epoch": 0.51, "grad_norm": 9.75, "learning_rate": 2.8675498870196134e-06, "logits/chosen": -1.8961451053619385, "logits/rejected": -1.8421748876571655, "logps/chosen": -685.48779296875, "logps/rejected": -851.2835693359375, "loss": 0.3937, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.3051581382751465, "rewards/margins": 1.7909977436065674, "rewards/rejected": -5.096156120300293, "step": 7760 }, { "epoch": 0.51, "grad_norm": 59.5, "learning_rate": 2.861900658775477e-06, "logits/chosen": -1.5334454774856567, "logits/rejected": -1.5334103107452393, "logps/chosen": -586.7137451171875, "logps/rejected": -685.27880859375, "loss": 0.7974, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -3.3829848766326904, "rewards/margins": 0.9013487100601196, "rewards/rejected": -4.284333229064941, "step": 7770 }, { "epoch": 0.51, "grad_norm": 41.25, "learning_rate": 2.8562495424056985e-06, "logits/chosen": -1.590763807296753, "logits/rejected": -1.1857455968856812, "logps/chosen": -542.5369873046875, "logps/rejected": -703.8079833984375, "loss": 0.5379, "rewards/accuracies": 0.75, "rewards/chosen": -3.0740978717803955, "rewards/margins": 2.1772849559783936, "rewards/rejected": -5.251383304595947, "step": 7780 }, { "epoch": 0.51, "grad_norm": 17.375, "learning_rate": 2.8505965673935563e-06, "logits/chosen": -1.689965009689331, "logits/rejected": -1.2544862031936646, "logps/chosen": -593.6925048828125, "logps/rejected": -789.8462524414062, "loss": 0.262, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.1679325103759766, "rewards/margins": 2.271761655807495, "rewards/rejected": -5.439694404602051, "step": 7790 }, { "epoch": 0.51, "grad_norm": 15.75, "learning_rate": 2.8449417632320257e-06, "logits/chosen": -1.5398682355880737, "logits/rejected": -1.0652512311935425, "logps/chosen": -657.2113647460938, "logps/rejected": -835.1046752929688, "loss": 0.3957, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.581960678100586, "rewards/margins": 2.3023619651794434, "rewards/rejected": -5.884322643280029, "step": 7800 }, { "epoch": 0.51, "eval_logits/chosen": -1.5133934020996094, "eval_logits/rejected": -1.2424006462097168, "eval_logps/chosen": -620.6990966796875, "eval_logps/rejected": -765.4808349609375, "eval_loss": 0.5341325402259827, "eval_rewards/accuracies": 0.7354999780654907, "eval_rewards/chosen": -3.560795545578003, "eval_rewards/margins": 1.6482951641082764, "eval_rewards/rejected": -5.209090709686279, "eval_runtime": 1082.1937, "eval_samples_per_second": 1.848, "eval_steps_per_second": 1.848, "step": 7800 }, { "epoch": 0.51, "grad_norm": 9.0625, "learning_rate": 2.839285159423624e-06, "logits/chosen": -1.332390546798706, "logits/rejected": -1.5053774118423462, "logps/chosen": -648.623779296875, "logps/rejected": -817.5202026367188, "loss": 0.5601, "rewards/accuracies": 0.75, "rewards/chosen": -3.7404580116271973, "rewards/margins": 1.627743124961853, "rewards/rejected": -5.36820125579834, "step": 7810 }, { "epoch": 0.51, "grad_norm": 25.875, "learning_rate": 2.8336267854802596e-06, "logits/chosen": -1.3925707340240479, "logits/rejected": -1.5194270610809326, "logps/chosen": -618.7852783203125, "logps/rejected": -769.59619140625, "loss": 0.4864, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.728842258453369, "rewards/margins": 1.5971170663833618, "rewards/rejected": -5.325959205627441, "step": 7820 }, { "epoch": 0.51, "grad_norm": 34.75, "learning_rate": 2.8279666709230748e-06, "logits/chosen": -1.5318679809570312, "logits/rejected": -1.4090325832366943, "logps/chosen": -669.8541870117188, "logps/rejected": -794.2620849609375, "loss": 0.5221, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.6274898052215576, "rewards/margins": 1.8522398471832275, "rewards/rejected": -5.479729652404785, "step": 7830 }, { "epoch": 0.51, "grad_norm": 32.75, "learning_rate": 2.8223048452822917e-06, "logits/chosen": -1.5705835819244385, "logits/rejected": -0.6458364725112915, "logps/chosen": -628.6612548828125, "logps/rejected": -741.3211669921875, "loss": 0.4715, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.614602565765381, "rewards/margins": 2.070050001144409, "rewards/rejected": -5.684652805328369, "step": 7840 }, { "epoch": 0.51, "grad_norm": 32.25, "learning_rate": 2.8166413380970624e-06, "logits/chosen": -1.3357844352722168, "logits/rejected": -1.2040369510650635, "logps/chosen": -622.555419921875, "logps/rejected": -798.7926025390625, "loss": 0.4537, "rewards/accuracies": 0.75, "rewards/chosen": -3.7341277599334717, "rewards/margins": 1.833524465560913, "rewards/rejected": -5.567652702331543, "step": 7850 }, { "epoch": 0.51, "grad_norm": 6.71875, "learning_rate": 2.810976178915312e-06, "logits/chosen": -1.716202735900879, "logits/rejected": -1.2240136861801147, "logps/chosen": -707.09716796875, "logps/rejected": -788.760986328125, "loss": 0.6708, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.189077377319336, "rewards/margins": 1.1072841882705688, "rewards/rejected": -5.296360969543457, "step": 7860 }, { "epoch": 0.51, "grad_norm": 25.625, "learning_rate": 2.805309397293581e-06, "logits/chosen": -1.6097484827041626, "logits/rejected": -1.498447299003601, "logps/chosen": -676.7265625, "logps/rejected": -786.87451171875, "loss": 0.8168, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.9601166248321533, "rewards/margins": 1.3155524730682373, "rewards/rejected": -5.275669097900391, "step": 7870 }, { "epoch": 0.52, "grad_norm": 29.125, "learning_rate": 2.799641022796879e-06, "logits/chosen": -1.0337090492248535, "logits/rejected": -1.2438466548919678, "logps/chosen": -652.2096557617188, "logps/rejected": -762.60791015625, "loss": 0.6081, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.130144119262695, "rewards/margins": 1.384285569190979, "rewards/rejected": -5.514428615570068, "step": 7880 }, { "epoch": 0.52, "grad_norm": 26.625, "learning_rate": 2.793971084998522e-06, "logits/chosen": -1.3956397771835327, "logits/rejected": -1.3799974918365479, "logps/chosen": -573.8560180664062, "logps/rejected": -734.4013671875, "loss": 0.5846, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.491074800491333, "rewards/margins": 1.429463267326355, "rewards/rejected": -4.920537948608398, "step": 7890 }, { "epoch": 0.52, "grad_norm": 13.4375, "learning_rate": 2.7882996134799854e-06, "logits/chosen": -1.8229824304580688, "logits/rejected": -1.2415785789489746, "logps/chosen": -683.0785522460938, "logps/rejected": -782.3349609375, "loss": 0.5307, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.729081630706787, "rewards/margins": 1.628623604774475, "rewards/rejected": -5.357705116271973, "step": 7900 }, { "epoch": 0.52, "eval_logits/chosen": -1.5020606517791748, "eval_logits/rejected": -1.2260446548461914, "eval_logps/chosen": -629.4216918945312, "eval_logps/rejected": -765.5830078125, "eval_loss": 0.5246554613113403, "eval_rewards/accuracies": 0.737500011920929, "eval_rewards/chosen": -3.6480209827423096, "eval_rewards/margins": 1.5620914697647095, "eval_rewards/rejected": -5.21011209487915, "eval_runtime": 1082.5991, "eval_samples_per_second": 1.847, "eval_steps_per_second": 1.847, "step": 7900 }, { "epoch": 0.52, "grad_norm": 76.5, "learning_rate": 2.7826266378307456e-06, "logits/chosen": -1.3603765964508057, "logits/rejected": -1.1335108280181885, "logps/chosen": -654.8795166015625, "logps/rejected": -793.8699951171875, "loss": 0.4724, "rewards/accuracies": 0.75, "rewards/chosen": -4.10591983795166, "rewards/margins": 1.7544143199920654, "rewards/rejected": -5.860334396362305, "step": 7910 }, { "epoch": 0.52, "grad_norm": 31.875, "learning_rate": 2.7769521876481235e-06, "logits/chosen": -1.4018828868865967, "logits/rejected": -1.0509719848632812, "logps/chosen": -670.3870849609375, "logps/rejected": -787.39013671875, "loss": 0.6598, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.151216983795166, "rewards/margins": 1.2019758224487305, "rewards/rejected": -5.3531928062438965, "step": 7920 }, { "epoch": 0.52, "grad_norm": 56.5, "learning_rate": 2.771276292537138e-06, "logits/chosen": -1.3616807460784912, "logits/rejected": -0.9358698129653931, "logps/chosen": -704.0479125976562, "logps/rejected": -855.2088012695312, "loss": 0.4448, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.346724033355713, "rewards/margins": 1.7752158641815186, "rewards/rejected": -6.121939659118652, "step": 7930 }, { "epoch": 0.52, "grad_norm": 61.75, "learning_rate": 2.7655989821103423e-06, "logits/chosen": -1.4514845609664917, "logits/rejected": -0.9389088749885559, "logps/chosen": -712.8014526367188, "logps/rejected": -758.5474853515625, "loss": 0.8295, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -4.732651710510254, "rewards/margins": 0.850357711315155, "rewards/rejected": -5.583009719848633, "step": 7940 }, { "epoch": 0.52, "grad_norm": 30.25, "learning_rate": 2.759920285987675e-06, "logits/chosen": -1.2129136323928833, "logits/rejected": -0.665821373462677, "logps/chosen": -611.8226928710938, "logps/rejected": -708.1654052734375, "loss": 0.7028, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.975909471511841, "rewards/margins": 1.1963140964508057, "rewards/rejected": -5.1722235679626465, "step": 7950 }, { "epoch": 0.52, "grad_norm": 74.0, "learning_rate": 2.754240233796306e-06, "logits/chosen": -1.3825815916061401, "logits/rejected": -1.3320660591125488, "logps/chosen": -671.6278076171875, "logps/rejected": -724.0018310546875, "loss": 0.6874, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.829399824142456, "rewards/margins": 1.052416205406189, "rewards/rejected": -4.881816387176514, "step": 7960 }, { "epoch": 0.52, "grad_norm": 27.75, "learning_rate": 2.748558855170478e-06, "logits/chosen": -1.508163332939148, "logits/rejected": -1.2179902791976929, "logps/chosen": -710.49267578125, "logps/rejected": -757.9571533203125, "loss": 0.5627, "rewards/accuracies": 0.75, "rewards/chosen": -4.017090797424316, "rewards/margins": 0.8284121751785278, "rewards/rejected": -4.845502853393555, "step": 7970 }, { "epoch": 0.52, "grad_norm": 20.375, "learning_rate": 2.7428761797513535e-06, "logits/chosen": -1.5046226978302002, "logits/rejected": -1.0111268758773804, "logps/chosen": -591.57568359375, "logps/rejected": -738.3888549804688, "loss": 0.4461, "rewards/accuracies": 0.75, "rewards/chosen": -3.4044196605682373, "rewards/margins": 1.7217769622802734, "rewards/rejected": -5.12619686126709, "step": 7980 }, { "epoch": 0.52, "grad_norm": 10.875, "learning_rate": 2.7371922371868643e-06, "logits/chosen": -1.4601759910583496, "logits/rejected": -1.0335584878921509, "logps/chosen": -638.5921630859375, "logps/rejected": -851.85009765625, "loss": 0.3158, "rewards/accuracies": 0.875, "rewards/chosen": -3.767195463180542, "rewards/margins": 1.9210659265518188, "rewards/rejected": -5.68826150894165, "step": 7990 }, { "epoch": 0.52, "grad_norm": 33.5, "learning_rate": 2.731507057131548e-06, "logits/chosen": -1.6402997970581055, "logits/rejected": -1.0192586183547974, "logps/chosen": -693.2644653320312, "logps/rejected": -815.25244140625, "loss": 0.6165, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.480318546295166, "rewards/margins": 0.9681938886642456, "rewards/rejected": -5.448512077331543, "step": 8000 }, { "epoch": 0.52, "eval_logits/chosen": -1.3580044507980347, "eval_logits/rejected": -1.0660184621810913, "eval_logps/chosen": -719.4282836914062, "eval_logps/rejected": -859.5797119140625, "eval_loss": 0.5349783897399902, "eval_rewards/accuracies": 0.7384999990463257, "eval_rewards/chosen": -4.548086643218994, "eval_rewards/margins": 1.601993441581726, "eval_rewards/rejected": -6.150079250335693, "eval_runtime": 1082.1434, "eval_samples_per_second": 1.848, "eval_steps_per_second": 1.848, "step": 8000 }, { "epoch": 0.52, "grad_norm": 3.03125, "learning_rate": 2.7258206692464033e-06, "logits/chosen": -1.6001415252685547, "logits/rejected": -1.0552270412445068, "logps/chosen": -709.0032348632812, "logps/rejected": -863.8624877929688, "loss": 0.401, "rewards/accuracies": 0.875, "rewards/chosen": -4.405813694000244, "rewards/margins": 1.8645999431610107, "rewards/rejected": -6.270413398742676, "step": 8010 }, { "epoch": 0.52, "grad_norm": 48.5, "learning_rate": 2.7201331031987282e-06, "logits/chosen": -1.5670092105865479, "logits/rejected": -0.8951814770698547, "logps/chosen": -662.1847534179688, "logps/rejected": -744.3477783203125, "loss": 0.56, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.231276512145996, "rewards/margins": 1.4487323760986328, "rewards/rejected": -5.680008888244629, "step": 8020 }, { "epoch": 0.53, "grad_norm": 28.75, "learning_rate": 2.7144443886619676e-06, "logits/chosen": -0.9534385800361633, "logits/rejected": -0.9793499708175659, "logps/chosen": -793.2039794921875, "logps/rejected": -1004.7320556640625, "loss": 0.5108, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -5.072151184082031, "rewards/margins": 2.227297306060791, "rewards/rejected": -7.299448490142822, "step": 8030 }, { "epoch": 0.53, "grad_norm": 169.0, "learning_rate": 2.7087545553155577e-06, "logits/chosen": -0.9101541638374329, "logits/rejected": -0.6721819639205933, "logps/chosen": -791.4165649414062, "logps/rejected": -913.3650512695312, "loss": 0.4762, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -5.2363080978393555, "rewards/margins": 1.953226089477539, "rewards/rejected": -7.1895341873168945, "step": 8040 }, { "epoch": 0.53, "grad_norm": 27.875, "learning_rate": 2.703063632844773e-06, "logits/chosen": -1.336223840713501, "logits/rejected": -1.014787197113037, "logps/chosen": -876.7347412109375, "logps/rejected": -946.66015625, "loss": 0.7669, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -5.986825466156006, "rewards/margins": 1.0640041828155518, "rewards/rejected": -7.0508294105529785, "step": 8050 }, { "epoch": 0.53, "grad_norm": 27.125, "learning_rate": 2.6973716509405696e-06, "logits/chosen": -1.2908620834350586, "logits/rejected": -1.0295054912567139, "logps/chosen": -794.3842163085938, "logps/rejected": -971.4978637695312, "loss": 0.7108, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -5.522763729095459, "rewards/margins": 1.3005023002624512, "rewards/rejected": -6.82326602935791, "step": 8060 }, { "epoch": 0.53, "grad_norm": 39.25, "learning_rate": 2.691678639299431e-06, "logits/chosen": -1.0660008192062378, "logits/rejected": -0.9446924924850464, "logps/chosen": -829.6375122070312, "logps/rejected": -948.4158935546875, "loss": 0.5823, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -5.610718727111816, "rewards/margins": 1.309958577156067, "rewards/rejected": -6.920677185058594, "step": 8070 }, { "epoch": 0.53, "grad_norm": 17.25, "learning_rate": 2.6859846276232143e-06, "logits/chosen": -1.6054439544677734, "logits/rejected": -1.2499165534973145, "logps/chosen": -832.5867919921875, "logps/rejected": -920.7916259765625, "loss": 0.612, "rewards/accuracies": 0.75, "rewards/chosen": -5.0084428787231445, "rewards/margins": 1.4668790102005005, "rewards/rejected": -6.475321292877197, "step": 8080 }, { "epoch": 0.53, "grad_norm": 18.375, "learning_rate": 2.6802896456189918e-06, "logits/chosen": -1.2162902355194092, "logits/rejected": -0.8535488247871399, "logps/chosen": -768.1673583984375, "logps/rejected": -916.3607177734375, "loss": 0.3461, "rewards/accuracies": 0.875, "rewards/chosen": -5.2103705406188965, "rewards/margins": 1.914994478225708, "rewards/rejected": -7.125364780426025, "step": 8090 }, { "epoch": 0.53, "grad_norm": 41.0, "learning_rate": 2.6745937229989005e-06, "logits/chosen": -0.9392092823982239, "logits/rejected": -0.49442416429519653, "logps/chosen": -698.0362548828125, "logps/rejected": -903.4446411132812, "loss": 0.4843, "rewards/accuracies": 0.75, "rewards/chosen": -5.2168869972229, "rewards/margins": 1.9744939804077148, "rewards/rejected": -7.191380977630615, "step": 8100 }, { "epoch": 0.53, "eval_logits/chosen": -1.2202908992767334, "eval_logits/rejected": -0.9235472679138184, "eval_logps/chosen": -798.6174926757812, "eval_logps/rejected": -945.3572998046875, "eval_loss": 0.5416484475135803, "eval_rewards/accuracies": 0.734499990940094, "eval_rewards/chosen": -5.339978218078613, "eval_rewards/margins": 1.6678783893585205, "eval_rewards/rejected": -7.007856369018555, "eval_runtime": 1082.1841, "eval_samples_per_second": 1.848, "eval_steps_per_second": 1.848, "step": 8100 }, { "epoch": 0.53, "grad_norm": 28.125, "learning_rate": 2.6688968894799827e-06, "logits/chosen": -1.2646559476852417, "logits/rejected": -0.9963359832763672, "logps/chosen": -736.5479736328125, "logps/rejected": -805.6297607421875, "loss": 0.6125, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -5.3408613204956055, "rewards/margins": 0.9383338093757629, "rewards/rejected": -6.279195308685303, "step": 8110 }, { "epoch": 0.53, "grad_norm": 19.25, "learning_rate": 2.6631991747840335e-06, "logits/chosen": -1.4416183233261108, "logits/rejected": -1.1871131658554077, "logps/chosen": -807.6776123046875, "logps/rejected": -920.7096557617188, "loss": 0.5944, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.7908935546875, "rewards/margins": 1.5614043474197388, "rewards/rejected": -6.352298259735107, "step": 8120 }, { "epoch": 0.53, "grad_norm": 15.875, "learning_rate": 2.657500608637448e-06, "logits/chosen": -1.5198818445205688, "logits/rejected": -1.3541873693466187, "logps/chosen": -640.6867065429688, "logps/rejected": -873.3805541992188, "loss": 0.3377, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.705185651779175, "rewards/margins": 2.3526089191436768, "rewards/rejected": -6.05779504776001, "step": 8130 }, { "epoch": 0.53, "grad_norm": 32.25, "learning_rate": 2.6518012207710595e-06, "logits/chosen": -1.3339468240737915, "logits/rejected": -0.9314252734184265, "logps/chosen": -661.774169921875, "logps/rejected": -784.5595703125, "loss": 0.5627, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.1874680519104, "rewards/margins": 1.6462504863739014, "rewards/rejected": -5.833718299865723, "step": 8140 }, { "epoch": 0.53, "grad_norm": 6.25, "learning_rate": 2.646101040919991e-06, "logits/chosen": -1.453536033630371, "logits/rejected": -1.0298473834991455, "logps/chosen": -696.450439453125, "logps/rejected": -850.93798828125, "loss": 0.5354, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.177577018737793, "rewards/margins": 1.7299931049346924, "rewards/rejected": -5.9075703620910645, "step": 8150 }, { "epoch": 0.53, "grad_norm": 108.0, "learning_rate": 2.640400098823498e-06, "logits/chosen": -1.7782577276229858, "logits/rejected": -1.1781136989593506, "logps/chosen": -760.330078125, "logps/rejected": -833.7664184570312, "loss": 0.7084, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.396557807922363, "rewards/margins": 1.637887954711914, "rewards/rejected": -6.034445762634277, "step": 8160 }, { "epoch": 0.53, "grad_norm": 15.25, "learning_rate": 2.63469842422481e-06, "logits/chosen": -1.449829339981079, "logits/rejected": -1.2080628871917725, "logps/chosen": -662.307861328125, "logps/rejected": -770.8550415039062, "loss": 0.5119, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.6456217765808105, "rewards/margins": 1.418807864189148, "rewards/rejected": -5.064429759979248, "step": 8170 }, { "epoch": 0.54, "grad_norm": 35.5, "learning_rate": 2.6289960468709803e-06, "logits/chosen": -1.282387137413025, "logits/rejected": -0.9779027700424194, "logps/chosen": -672.42236328125, "logps/rejected": -755.41943359375, "loss": 0.5643, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.903532028198242, "rewards/margins": 1.795692801475525, "rewards/rejected": -5.699224948883057, "step": 8180 }, { "epoch": 0.54, "grad_norm": 8.125, "learning_rate": 2.6232929965127285e-06, "logits/chosen": -1.4180303812026978, "logits/rejected": -0.5423529148101807, "logps/chosen": -672.651611328125, "logps/rejected": -793.2083129882812, "loss": 0.5854, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.7229480743408203, "rewards/margins": 1.7211631536483765, "rewards/rejected": -5.4441118240356445, "step": 8190 }, { "epoch": 0.54, "grad_norm": 24.75, "learning_rate": 2.617589302904285e-06, "logits/chosen": -1.4851610660552979, "logits/rejected": -1.2707717418670654, "logps/chosen": -639.7072143554688, "logps/rejected": -797.170654296875, "loss": 0.3469, "rewards/accuracies": 0.875, "rewards/chosen": -3.7552237510681152, "rewards/margins": 1.9993658065795898, "rewards/rejected": -5.754590034484863, "step": 8200 }, { "epoch": 0.54, "eval_logits/chosen": -1.4046506881713867, "eval_logits/rejected": -1.0938910245895386, "eval_logps/chosen": -695.155517578125, "eval_logps/rejected": -838.6585083007812, "eval_loss": 0.529402494430542, "eval_rewards/accuracies": 0.7360000014305115, "eval_rewards/chosen": -4.305358409881592, "eval_rewards/margins": 1.6355096101760864, "eval_rewards/rejected": -5.940868377685547, "eval_runtime": 1082.9577, "eval_samples_per_second": 1.847, "eval_steps_per_second": 1.847, "step": 8200 }, { "epoch": 0.54, "grad_norm": 67.5, "learning_rate": 2.611884995803236e-06, "logits/chosen": -1.2090909481048584, "logits/rejected": -0.773546576499939, "logps/chosen": -672.5487060546875, "logps/rejected": -821.1400146484375, "loss": 0.4929, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.3934831619262695, "rewards/margins": 1.4755960702896118, "rewards/rejected": -5.86907958984375, "step": 8210 }, { "epoch": 0.54, "grad_norm": 10.4375, "learning_rate": 2.6061801049703694e-06, "logits/chosen": -1.2500954866409302, "logits/rejected": -1.0217788219451904, "logps/chosen": -697.799072265625, "logps/rejected": -848.2398681640625, "loss": 0.4793, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.217865943908691, "rewards/margins": 1.5705519914627075, "rewards/rejected": -5.788417816162109, "step": 8220 }, { "epoch": 0.54, "grad_norm": 14.625, "learning_rate": 2.6004746601695175e-06, "logits/chosen": -1.3860528469085693, "logits/rejected": -0.899990439414978, "logps/chosen": -664.425048828125, "logps/rejected": -846.7745971679688, "loss": 0.3616, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.607015132904053, "rewards/margins": 1.7028589248657227, "rewards/rejected": -6.309874534606934, "step": 8230 }, { "epoch": 0.54, "grad_norm": 29.25, "learning_rate": 2.5947686911674027e-06, "logits/chosen": -1.528558373451233, "logits/rejected": -1.2478182315826416, "logps/chosen": -691.2342529296875, "logps/rejected": -853.4295654296875, "loss": 0.3818, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.119698524475098, "rewards/margins": 1.9977786540985107, "rewards/rejected": -6.117477893829346, "step": 8240 }, { "epoch": 0.54, "grad_norm": 16.375, "learning_rate": 2.5890622277334833e-06, "logits/chosen": -1.312318205833435, "logits/rejected": -0.7233628034591675, "logps/chosen": -725.2612915039062, "logps/rejected": -865.37890625, "loss": 0.4634, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.428563117980957, "rewards/margins": 1.9727401733398438, "rewards/rejected": -6.401303291320801, "step": 8250 }, { "epoch": 0.54, "grad_norm": 31.125, "learning_rate": 2.5833552996397964e-06, "logits/chosen": -1.3561217784881592, "logits/rejected": -1.2873425483703613, "logps/chosen": -740.9383544921875, "logps/rejected": -927.3157958984375, "loss": 0.481, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.745497226715088, "rewards/margins": 1.7488653659820557, "rewards/rejected": -6.494362831115723, "step": 8260 }, { "epoch": 0.54, "grad_norm": 9.8125, "learning_rate": 2.577647936660804e-06, "logits/chosen": -1.5383434295654297, "logits/rejected": -1.310014009475708, "logps/chosen": -677.870361328125, "logps/rejected": -854.2849731445312, "loss": 0.4403, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.7463321685791016, "rewards/margins": 1.8394863605499268, "rewards/rejected": -5.585817813873291, "step": 8270 }, { "epoch": 0.54, "grad_norm": 26.0, "learning_rate": 2.571940168573237e-06, "logits/chosen": -1.2331101894378662, "logits/rejected": -0.8981884121894836, "logps/chosen": -670.4237670898438, "logps/rejected": -734.097900390625, "loss": 0.5431, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.197335720062256, "rewards/margins": 1.1765263080596924, "rewards/rejected": -5.373861789703369, "step": 8280 }, { "epoch": 0.54, "grad_norm": 8.6875, "learning_rate": 2.566232025155938e-06, "logits/chosen": -0.9489194750785828, "logits/rejected": -1.184788703918457, "logps/chosen": -654.6783447265625, "logps/rejected": -834.8394775390625, "loss": 0.4762, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.230862617492676, "rewards/margins": 1.9186365604400635, "rewards/rejected": -6.14949893951416, "step": 8290 }, { "epoch": 0.54, "grad_norm": 107.0, "learning_rate": 2.5605235361897105e-06, "logits/chosen": -1.4068844318389893, "logits/rejected": -0.9613475799560547, "logps/chosen": -751.3697509765625, "logps/rejected": -828.9688720703125, "loss": 0.6583, "rewards/accuracies": 0.625, "rewards/chosen": -4.763547897338867, "rewards/margins": 0.8105732798576355, "rewards/rejected": -5.574120998382568, "step": 8300 }, { "epoch": 0.54, "eval_logits/chosen": -1.2945964336395264, "eval_logits/rejected": -0.9177000522613525, "eval_logps/chosen": -724.0404663085938, "eval_logps/rejected": -876.1429443359375, "eval_loss": 0.5330018401145935, "eval_rewards/accuracies": 0.7425000071525574, "eval_rewards/chosen": -4.594208240509033, "eval_rewards/margins": 1.7215044498443604, "eval_rewards/rejected": -6.315712928771973, "eval_runtime": 1082.7828, "eval_samples_per_second": 1.847, "eval_steps_per_second": 1.847, "step": 8300 }, { "epoch": 0.54, "grad_norm": 50.0, "learning_rate": 2.5548147314571596e-06, "logits/chosen": -1.5618624687194824, "logits/rejected": -1.5080831050872803, "logps/chosen": -697.51611328125, "logps/rejected": -792.1029663085938, "loss": 0.6531, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -4.252680778503418, "rewards/margins": 1.1798148155212402, "rewards/rejected": -5.432496070861816, "step": 8310 }, { "epoch": 0.54, "grad_norm": 21.75, "learning_rate": 2.549105640742537e-06, "logits/chosen": -1.5498484373092651, "logits/rejected": -1.2533833980560303, "logps/chosen": -636.6640014648438, "logps/rejected": -757.121337890625, "loss": 0.7238, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -3.934778928756714, "rewards/margins": 1.0476784706115723, "rewards/rejected": -4.982457637786865, "step": 8320 }, { "epoch": 0.55, "grad_norm": 10.4375, "learning_rate": 2.543396293831588e-06, "logits/chosen": -1.4631760120391846, "logits/rejected": -0.34479814767837524, "logps/chosen": -634.5352783203125, "logps/rejected": -778.4528198242188, "loss": 0.4311, "rewards/accuracies": 0.75, "rewards/chosen": -4.031635761260986, "rewards/margins": 1.9084018468856812, "rewards/rejected": -5.940037250518799, "step": 8330 }, { "epoch": 0.55, "grad_norm": 12.3125, "learning_rate": 2.5376867205113927e-06, "logits/chosen": -1.1158607006072998, "logits/rejected": -0.937054455280304, "logps/chosen": -679.3633422851562, "logps/rejected": -817.6707153320312, "loss": 0.6543, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.4779863357543945, "rewards/margins": 1.0260884761810303, "rewards/rejected": -5.504074573516846, "step": 8340 }, { "epoch": 0.55, "grad_norm": 30.25, "learning_rate": 2.531976950570214e-06, "logits/chosen": -1.37745201587677, "logits/rejected": -0.864303708076477, "logps/chosen": -708.8817138671875, "logps/rejected": -826.9033203125, "loss": 0.6936, "rewards/accuracies": 0.75, "rewards/chosen": -4.685024261474609, "rewards/margins": 1.5368804931640625, "rewards/rejected": -6.221904754638672, "step": 8350 }, { "epoch": 0.55, "grad_norm": 36.5, "learning_rate": 2.5262670137973413e-06, "logits/chosen": -1.816552758216858, "logits/rejected": -1.633056640625, "logps/chosen": -700.42919921875, "logps/rejected": -801.937255859375, "loss": 0.5832, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.08888578414917, "rewards/margins": 1.0672401189804077, "rewards/rejected": -5.156126022338867, "step": 8360 }, { "epoch": 0.55, "grad_norm": 28.875, "learning_rate": 2.520556939982931e-06, "logits/chosen": -1.1280368566513062, "logits/rejected": -1.2861627340316772, "logps/chosen": -643.26123046875, "logps/rejected": -751.5245361328125, "loss": 0.5266, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.123172760009766, "rewards/margins": 1.0250604152679443, "rewards/rejected": -5.148233413696289, "step": 8370 }, { "epoch": 0.55, "grad_norm": 17.125, "learning_rate": 2.5148467589178593e-06, "logits/chosen": -1.6376216411590576, "logits/rejected": -0.9720862507820129, "logps/chosen": -611.5180053710938, "logps/rejected": -748.3888549804688, "loss": 0.3957, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.540008544921875, "rewards/margins": 1.8697335720062256, "rewards/rejected": -5.40974235534668, "step": 8380 }, { "epoch": 0.55, "grad_norm": 29.125, "learning_rate": 2.509136500393557e-06, "logits/chosen": -1.1364970207214355, "logits/rejected": -1.6248633861541748, "logps/chosen": -696.622314453125, "logps/rejected": -923.5265502929688, "loss": 0.4607, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.164916038513184, "rewards/margins": 1.8808122873306274, "rewards/rejected": -6.0457282066345215, "step": 8390 }, { "epoch": 0.55, "grad_norm": 12.5625, "learning_rate": 2.5034261942018627e-06, "logits/chosen": -1.0822927951812744, "logits/rejected": -1.307224988937378, "logps/chosen": -584.4113159179688, "logps/rejected": -753.5318603515625, "loss": 0.3581, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.7284138202667236, "rewards/margins": 1.8831422328948975, "rewards/rejected": -5.611556053161621, "step": 8400 }, { "epoch": 0.55, "eval_logits/chosen": -1.387689471244812, "eval_logits/rejected": -1.0402836799621582, "eval_logps/chosen": -707.3421020507812, "eval_logps/rejected": -855.9658813476562, "eval_loss": 0.5290474891662598, "eval_rewards/accuracies": 0.7429999709129333, "eval_rewards/chosen": -4.427225112915039, "eval_rewards/margins": 1.686716914176941, "eval_rewards/rejected": -6.1139421463012695, "eval_runtime": 1082.9613, "eval_samples_per_second": 1.847, "eval_steps_per_second": 1.847, "step": 8400 }, { "epoch": 0.55, "grad_norm": 51.25, "learning_rate": 2.497715870134863e-06, "logits/chosen": -1.2431710958480835, "logits/rejected": -0.6773529052734375, "logps/chosen": -632.2696533203125, "logps/rejected": -780.3651123046875, "loss": 0.533, "rewards/accuracies": 0.75, "rewards/chosen": -4.2093915939331055, "rewards/margins": 1.6615968942642212, "rewards/rejected": -5.870987892150879, "step": 8410 }, { "epoch": 0.55, "grad_norm": 10.5625, "learning_rate": 2.492005557984735e-06, "logits/chosen": -1.0662009716033936, "logits/rejected": -0.8960220217704773, "logps/chosen": -727.3512573242188, "logps/rejected": -864.7152099609375, "loss": 0.5515, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.414191722869873, "rewards/margins": 1.7178767919540405, "rewards/rejected": -6.132069110870361, "step": 8420 }, { "epoch": 0.55, "grad_norm": 103.5, "learning_rate": 2.4862952875435976e-06, "logits/chosen": -1.4846833944320679, "logits/rejected": -0.998816967010498, "logps/chosen": -725.5767822265625, "logps/rejected": -873.3270263671875, "loss": 0.5092, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.583536624908447, "rewards/margins": 1.7174094915390015, "rewards/rejected": -6.300946235656738, "step": 8430 }, { "epoch": 0.55, "grad_norm": 39.0, "learning_rate": 2.4805850886033493e-06, "logits/chosen": -1.3239281177520752, "logits/rejected": -1.1426351070404053, "logps/chosen": -618.4832763671875, "logps/rejected": -805.0882568359375, "loss": 0.4471, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.113088607788086, "rewards/margins": 1.7559947967529297, "rewards/rejected": -5.869083881378174, "step": 8440 }, { "epoch": 0.55, "grad_norm": 11.125, "learning_rate": 2.4748749909555175e-06, "logits/chosen": -1.3142292499542236, "logits/rejected": -0.9820087552070618, "logps/chosen": -641.9256591796875, "logps/rejected": -839.6998291015625, "loss": 0.3467, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.9989819526672363, "rewards/margins": 2.1957006454467773, "rewards/rejected": -6.1946821212768555, "step": 8450 }, { "epoch": 0.55, "grad_norm": 30.75, "learning_rate": 2.469165024391099e-06, "logits/chosen": -1.5505149364471436, "logits/rejected": -0.8935020565986633, "logps/chosen": -702.2338256835938, "logps/rejected": -766.9478149414062, "loss": 0.5654, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.198131084442139, "rewards/margins": 1.080320119857788, "rewards/rejected": -5.278450965881348, "step": 8460 }, { "epoch": 0.55, "grad_norm": 52.0, "learning_rate": 2.4634552187004088e-06, "logits/chosen": -1.4478824138641357, "logits/rejected": -0.7905007004737854, "logps/chosen": -691.4960327148438, "logps/rejected": -852.5149536132812, "loss": 0.4626, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.907679796218872, "rewards/margins": 1.7279293537139893, "rewards/rejected": -5.635609149932861, "step": 8470 }, { "epoch": 0.55, "grad_norm": 35.0, "learning_rate": 2.4577456036729212e-06, "logits/chosen": -1.2219147682189941, "logits/rejected": -1.0615158081054688, "logps/chosen": -641.5470581054688, "logps/rejected": -783.1008911132812, "loss": 0.7137, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.238812446594238, "rewards/margins": 1.2901554107666016, "rewards/rejected": -5.52896785736084, "step": 8480 }, { "epoch": 0.56, "grad_norm": 45.0, "learning_rate": 2.452036209097116e-06, "logits/chosen": -1.362618327140808, "logits/rejected": -1.133803129196167, "logps/chosen": -686.873779296875, "logps/rejected": -844.2684326171875, "loss": 0.4375, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.108596324920654, "rewards/margins": 1.733668565750122, "rewards/rejected": -5.842264652252197, "step": 8490 }, { "epoch": 0.56, "grad_norm": 10.75, "learning_rate": 2.4463270647603236e-06, "logits/chosen": -1.0714175701141357, "logits/rejected": -0.720252275466919, "logps/chosen": -638.1339111328125, "logps/rejected": -818.6375732421875, "loss": 0.4143, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.213076591491699, "rewards/margins": 1.8780252933502197, "rewards/rejected": -6.09110164642334, "step": 8500 }, { "epoch": 0.56, "eval_logits/chosen": -1.3601245880126953, "eval_logits/rejected": -0.9932692646980286, "eval_logps/chosen": -685.41162109375, "eval_logps/rejected": -838.3191528320312, "eval_loss": 0.5270918011665344, "eval_rewards/accuracies": 0.7505000233650208, "eval_rewards/chosen": -4.207921028137207, "eval_rewards/margins": 1.7295540571212769, "eval_rewards/rejected": -5.937474250793457, "eval_runtime": 1082.6652, "eval_samples_per_second": 1.847, "eval_steps_per_second": 1.847, "step": 8500 }, { "epoch": 0.56, "grad_norm": 20.125, "learning_rate": 2.440618200448568e-06, "logits/chosen": -1.325921654701233, "logits/rejected": -1.3798624277114868, "logps/chosen": -703.6557006835938, "logps/rejected": -876.6257934570312, "loss": 0.3944, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.171205043792725, "rewards/margins": 1.8561408519744873, "rewards/rejected": -6.027346134185791, "step": 8510 }, { "epoch": 0.56, "grad_norm": 34.75, "learning_rate": 2.4349096459464127e-06, "logits/chosen": -1.1948151588439941, "logits/rejected": -0.9158404469490051, "logps/chosen": -677.0951538085938, "logps/rejected": -865.8527221679688, "loss": 0.3765, "rewards/accuracies": 0.875, "rewards/chosen": -4.429177284240723, "rewards/margins": 2.1494603157043457, "rewards/rejected": -6.57863712310791, "step": 8520 }, { "epoch": 0.56, "grad_norm": 45.25, "learning_rate": 2.429201431036803e-06, "logits/chosen": -1.1939901113510132, "logits/rejected": -1.1580101251602173, "logps/chosen": -654.6717529296875, "logps/rejected": -842.4515380859375, "loss": 0.5616, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.330679893493652, "rewards/margins": 1.3604307174682617, "rewards/rejected": -5.691111087799072, "step": 8530 }, { "epoch": 0.56, "grad_norm": 28.625, "learning_rate": 2.423493585500917e-06, "logits/chosen": -1.4993419647216797, "logits/rejected": -0.7261701822280884, "logps/chosen": -716.4431762695312, "logps/rejected": -793.9991455078125, "loss": 0.5313, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.254256248474121, "rewards/margins": 1.4344720840454102, "rewards/rejected": -5.688729286193848, "step": 8540 }, { "epoch": 0.56, "grad_norm": 33.5, "learning_rate": 2.4177861391180016e-06, "logits/chosen": -1.5926620960235596, "logits/rejected": -1.3574860095977783, "logps/chosen": -742.94482421875, "logps/rejected": -788.5765380859375, "loss": 0.5943, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.429677963256836, "rewards/margins": 1.189591646194458, "rewards/rejected": -5.619269371032715, "step": 8550 }, { "epoch": 0.56, "grad_norm": 9.1875, "learning_rate": 2.412079121665222e-06, "logits/chosen": -1.3858203887939453, "logits/rejected": -0.9652866125106812, "logps/chosen": -663.4866943359375, "logps/rejected": -840.6754150390625, "loss": 0.4543, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.7354683876037598, "rewards/margins": 1.9580118656158447, "rewards/rejected": -5.693480491638184, "step": 8560 }, { "epoch": 0.56, "grad_norm": 25.75, "learning_rate": 2.4063725629175062e-06, "logits/chosen": -1.478569746017456, "logits/rejected": -1.2530734539031982, "logps/chosen": -598.4396362304688, "logps/rejected": -780.3701171875, "loss": 0.5513, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.524883985519409, "rewards/margins": 1.8823976516723633, "rewards/rejected": -5.407281875610352, "step": 8570 }, { "epoch": 0.56, "grad_norm": 19.25, "learning_rate": 2.4006664926473896e-06, "logits/chosen": -1.622981309890747, "logits/rejected": -1.0355640649795532, "logps/chosen": -578.5248413085938, "logps/rejected": -673.4351196289062, "loss": 0.5081, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.1032838821411133, "rewards/margins": 1.5864107608795166, "rewards/rejected": -4.689694404602051, "step": 8580 }, { "epoch": 0.56, "grad_norm": 4.5, "learning_rate": 2.3949609406248576e-06, "logits/chosen": -1.4226912260055542, "logits/rejected": -1.0714054107666016, "logps/chosen": -512.7940673828125, "logps/rejected": -665.2630615234375, "loss": 0.4122, "rewards/accuracies": 0.75, "rewards/chosen": -3.335524082183838, "rewards/margins": 1.5525932312011719, "rewards/rejected": -4.88811731338501, "step": 8590 }, { "epoch": 0.56, "grad_norm": 45.75, "learning_rate": 2.3892559366171922e-06, "logits/chosen": -1.2024462223052979, "logits/rejected": -1.4201934337615967, "logps/chosen": -638.9749755859375, "logps/rejected": -783.408203125, "loss": 0.6205, "rewards/accuracies": 0.625, "rewards/chosen": -3.842050552368164, "rewards/margins": 1.6902132034301758, "rewards/rejected": -5.53226375579834, "step": 8600 }, { "epoch": 0.56, "eval_logits/chosen": -1.429022192955017, "eval_logits/rejected": -1.067404866218567, "eval_logps/chosen": -662.8466186523438, "eval_logps/rejected": -823.1312866210938, "eval_loss": 0.5300338268280029, "eval_rewards/accuracies": 0.7490000128746033, "eval_rewards/chosen": -3.9822700023651123, "eval_rewards/margins": 1.8033267259597778, "eval_rewards/rejected": -5.785597324371338, "eval_runtime": 1082.9733, "eval_samples_per_second": 1.847, "eval_steps_per_second": 1.847, "step": 8600 }, { "epoch": 0.56, "grad_norm": 11.3125, "learning_rate": 2.3835515103888173e-06, "logits/chosen": -1.5714504718780518, "logits/rejected": -0.7915675044059753, "logps/chosen": -614.689208984375, "logps/rejected": -799.8009033203125, "loss": 0.3701, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.5360045433044434, "rewards/margins": 2.098367214202881, "rewards/rejected": -5.634371757507324, "step": 8610 }, { "epoch": 0.56, "grad_norm": 21.75, "learning_rate": 2.377847691701141e-06, "logits/chosen": -1.5732303857803345, "logits/rejected": -1.0825724601745605, "logps/chosen": -744.1302490234375, "logps/rejected": -868.7034301757812, "loss": 0.6038, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.492016792297363, "rewards/margins": 1.5674570798873901, "rewards/rejected": -6.059473514556885, "step": 8620 }, { "epoch": 0.56, "grad_norm": 27.625, "learning_rate": 2.372144510312403e-06, "logits/chosen": -1.6502927541732788, "logits/rejected": -1.2980979681015015, "logps/chosen": -710.6804809570312, "logps/rejected": -914.0060424804688, "loss": 0.5167, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.486581325531006, "rewards/margins": 1.9669443368911743, "rewards/rejected": -6.453525543212891, "step": 8630 }, { "epoch": 0.57, "grad_norm": 34.25, "learning_rate": 2.366441995977516e-06, "logits/chosen": -1.5177066326141357, "logits/rejected": -1.3701558113098145, "logps/chosen": -727.0242309570312, "logps/rejected": -878.7013549804688, "loss": 0.439, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.875014543533325, "rewards/margins": 2.112377405166626, "rewards/rejected": -5.987391471862793, "step": 8640 }, { "epoch": 0.57, "grad_norm": 16.625, "learning_rate": 2.3607401784479165e-06, "logits/chosen": -1.3622164726257324, "logits/rejected": -1.1867282390594482, "logps/chosen": -677.0134887695312, "logps/rejected": -837.6334838867188, "loss": 0.4746, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.184944152832031, "rewards/margins": 1.8259378671646118, "rewards/rejected": -6.010882377624512, "step": 8650 }, { "epoch": 0.57, "grad_norm": 17.875, "learning_rate": 2.355039087471401e-06, "logits/chosen": -1.22259521484375, "logits/rejected": -0.7646195292472839, "logps/chosen": -733.0143432617188, "logps/rejected": -954.0257568359375, "loss": 0.2859, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.289002895355225, "rewards/margins": 2.335888385772705, "rewards/rejected": -6.624891757965088, "step": 8660 }, { "epoch": 0.57, "grad_norm": 25.25, "learning_rate": 2.349338752791978e-06, "logits/chosen": -1.223301887512207, "logits/rejected": -0.7951595187187195, "logps/chosen": -672.01904296875, "logps/rejected": -889.2867431640625, "loss": 0.3672, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.075139045715332, "rewards/margins": 2.9197914600372314, "rewards/rejected": -6.994929313659668, "step": 8670 }, { "epoch": 0.57, "grad_norm": 7.25, "learning_rate": 2.3436392041497095e-06, "logits/chosen": -1.182023286819458, "logits/rejected": -0.9708463549613953, "logps/chosen": -730.480712890625, "logps/rejected": -871.4420776367188, "loss": 0.7577, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.795408248901367, "rewards/margins": 1.558915376663208, "rewards/rejected": -6.354323387145996, "step": 8680 }, { "epoch": 0.57, "grad_norm": 5.34375, "learning_rate": 2.337940471280557e-06, "logits/chosen": -1.4357311725616455, "logits/rejected": -0.782059371471405, "logps/chosen": -754.4589233398438, "logps/rejected": -866.4690551757812, "loss": 0.8077, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.491460800170898, "rewards/margins": 1.6085491180419922, "rewards/rejected": -6.100009441375732, "step": 8690 }, { "epoch": 0.57, "grad_norm": 16.375, "learning_rate": 2.3322425839162252e-06, "logits/chosen": -1.4589821100234985, "logits/rejected": -0.9568243026733398, "logps/chosen": -578.43798828125, "logps/rejected": -751.6834716796875, "loss": 0.5613, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.399801731109619, "rewards/margins": 1.4623279571533203, "rewards/rejected": -4.862129211425781, "step": 8700 }, { "epoch": 0.57, "eval_logits/chosen": -1.46004056930542, "eval_logits/rejected": -1.0772278308868408, "eval_logps/chosen": -629.4801025390625, "eval_logps/rejected": -791.0134887695312, "eval_loss": 0.5370397567749023, "eval_rewards/accuracies": 0.7404999732971191, "eval_rewards/chosen": -3.648604393005371, "eval_rewards/margins": 1.815813422203064, "eval_rewards/rejected": -5.464417934417725, "eval_runtime": 1082.2825, "eval_samples_per_second": 1.848, "eval_steps_per_second": 1.848, "step": 8700 }, { "epoch": 0.57, "grad_norm": 16.25, "learning_rate": 2.326545571784008e-06, "logits/chosen": -1.5362566709518433, "logits/rejected": -1.208397626876831, "logps/chosen": -622.8265380859375, "logps/rejected": -791.5584716796875, "loss": 0.5171, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.7001748085021973, "rewards/margins": 1.7112934589385986, "rewards/rejected": -5.411467552185059, "step": 8710 }, { "epoch": 0.57, "grad_norm": 22.625, "learning_rate": 2.3208494646066333e-06, "logits/chosen": -1.54216468334198, "logits/rejected": -1.1778007745742798, "logps/chosen": -573.1766357421875, "logps/rejected": -718.2933349609375, "loss": 0.6435, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.4819724559783936, "rewards/margins": 1.441941499710083, "rewards/rejected": -4.923913478851318, "step": 8720 }, { "epoch": 0.57, "grad_norm": 10.5625, "learning_rate": 2.3151542921021074e-06, "logits/chosen": -1.6029781103134155, "logits/rejected": -0.9410561323165894, "logps/chosen": -568.5474853515625, "logps/rejected": -736.5467529296875, "loss": 0.3532, "rewards/accuracies": 0.875, "rewards/chosen": -2.8595499992370605, "rewards/margins": 2.1266636848449707, "rewards/rejected": -4.986213684082031, "step": 8730 }, { "epoch": 0.57, "grad_norm": 2.671875, "learning_rate": 2.3094600839835592e-06, "logits/chosen": -1.739577293395996, "logits/rejected": -1.3979501724243164, "logps/chosen": -592.3319702148438, "logps/rejected": -807.5262451171875, "loss": 0.6321, "rewards/accuracies": 0.75, "rewards/chosen": -3.239413022994995, "rewards/margins": 1.9183158874511719, "rewards/rejected": -5.157729148864746, "step": 8740 }, { "epoch": 0.57, "grad_norm": 23.25, "learning_rate": 2.3037668699590864e-06, "logits/chosen": -1.3453651666641235, "logits/rejected": -1.094515085220337, "logps/chosen": -530.8018798828125, "logps/rejected": -766.175537109375, "loss": 0.4123, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.007462978363037, "rewards/margins": 2.5085318088531494, "rewards/rejected": -5.515994548797607, "step": 8750 }, { "epoch": 0.57, "grad_norm": 45.75, "learning_rate": 2.2980746797316038e-06, "logits/chosen": -1.804081916809082, "logits/rejected": -0.9202302098274231, "logps/chosen": -625.5728759765625, "logps/rejected": -764.26708984375, "loss": 0.4199, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.0844779014587402, "rewards/margins": 2.0823817253112793, "rewards/rejected": -5.1668596267700195, "step": 8760 }, { "epoch": 0.57, "grad_norm": 5.09375, "learning_rate": 2.2923835429986795e-06, "logits/chosen": -1.4056782722473145, "logits/rejected": -1.0573246479034424, "logps/chosen": -519.5914306640625, "logps/rejected": -626.1264038085938, "loss": 0.563, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.9310266971588135, "rewards/margins": 1.3666692972183228, "rewards/rejected": -4.297696113586426, "step": 8770 }, { "epoch": 0.57, "grad_norm": 9.125, "learning_rate": 2.286693489452389e-06, "logits/chosen": -1.1586670875549316, "logits/rejected": -1.0320886373519897, "logps/chosen": -547.344970703125, "logps/rejected": -843.2555541992188, "loss": 0.3745, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.26538348197937, "rewards/margins": 2.9982824325561523, "rewards/rejected": -6.26366662979126, "step": 8780 }, { "epoch": 0.58, "grad_norm": 123.5, "learning_rate": 2.2810045487791545e-06, "logits/chosen": -1.6320135593414307, "logits/rejected": -1.0514600276947021, "logps/chosen": -504.8511657714844, "logps/rejected": -701.0399169921875, "loss": 0.4514, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.062401294708252, "rewards/margins": 1.87258780002594, "rewards/rejected": -4.934988975524902, "step": 8790 }, { "epoch": 0.58, "grad_norm": 26.0, "learning_rate": 2.275316750659592e-06, "logits/chosen": -1.2856196165084839, "logits/rejected": -0.8280979990959167, "logps/chosen": -636.4570922851562, "logps/rejected": -893.4278564453125, "loss": 0.3026, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.791612148284912, "rewards/margins": 2.7072112560272217, "rewards/rejected": -6.4988226890563965, "step": 8800 }, { "epoch": 0.58, "eval_logits/chosen": -1.3582509756088257, "eval_logits/rejected": -0.9434436559677124, "eval_logps/chosen": -676.4411010742188, "eval_logps/rejected": -844.5537719726562, "eval_loss": 0.5405257344245911, "eval_rewards/accuracies": 0.7480000257492065, "eval_rewards/chosen": -4.118215560913086, "eval_rewards/margins": 1.8816055059432983, "eval_rewards/rejected": -5.999820709228516, "eval_runtime": 1082.1911, "eval_samples_per_second": 1.848, "eval_steps_per_second": 1.848, "step": 8800 }, { "epoch": 0.58, "grad_norm": 16.5, "learning_rate": 2.2696301247683587e-06, "logits/chosen": -1.3424618244171143, "logits/rejected": -1.0806810855865479, "logps/chosen": -665.5404052734375, "logps/rejected": -934.95703125, "loss": 0.4114, "rewards/accuracies": 0.875, "rewards/chosen": -3.9406962394714355, "rewards/margins": 2.3215556144714355, "rewards/rejected": -6.262252330780029, "step": 8810 }, { "epoch": 0.58, "grad_norm": 34.0, "learning_rate": 2.2639447007739933e-06, "logits/chosen": -1.4920878410339355, "logits/rejected": -0.8124421238899231, "logps/chosen": -674.9412841796875, "logps/rejected": -884.0438232421875, "loss": 0.4665, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.151027679443359, "rewards/margins": 2.4276182651519775, "rewards/rejected": -6.5786452293396, "step": 8820 }, { "epoch": 0.58, "grad_norm": 76.5, "learning_rate": 2.258260508338767e-06, "logits/chosen": -1.3500337600708008, "logits/rejected": -0.8095887899398804, "logps/chosen": -778.0701904296875, "logps/rejected": -852.9625244140625, "loss": 0.9375, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.714651584625244, "rewards/margins": 1.0577243566513062, "rewards/rejected": -5.77237606048584, "step": 8830 }, { "epoch": 0.58, "grad_norm": 10.875, "learning_rate": 2.2525775771185216e-06, "logits/chosen": -1.2423272132873535, "logits/rejected": -1.1019871234893799, "logps/chosen": -632.0445556640625, "logps/rejected": -951.9937744140625, "loss": 0.4079, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.031198501586914, "rewards/margins": 2.4285261631011963, "rewards/rejected": -6.459724426269531, "step": 8840 }, { "epoch": 0.58, "grad_norm": 29.625, "learning_rate": 2.2468959367625225e-06, "logits/chosen": -1.329973816871643, "logits/rejected": -1.0443298816680908, "logps/chosen": -657.650146484375, "logps/rejected": -846.4210815429688, "loss": 0.6106, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.935532808303833, "rewards/margins": 1.961175560951233, "rewards/rejected": -5.8967084884643555, "step": 8850 }, { "epoch": 0.58, "grad_norm": 35.25, "learning_rate": 2.2412156169132977e-06, "logits/chosen": -1.4136470556259155, "logits/rejected": -1.02044677734375, "logps/chosen": -491.96478271484375, "logps/rejected": -664.4104614257812, "loss": 0.4851, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.946263074874878, "rewards/margins": 1.6036627292633057, "rewards/rejected": -4.549925327301025, "step": 8860 }, { "epoch": 0.58, "grad_norm": 21.875, "learning_rate": 2.2355366472064885e-06, "logits/chosen": -1.3482542037963867, "logits/rejected": -1.2779749631881714, "logps/chosen": -625.4310302734375, "logps/rejected": -641.9002075195312, "loss": 0.6997, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.566612958908081, "rewards/margins": 0.9158374667167664, "rewards/rejected": -4.482450008392334, "step": 8870 }, { "epoch": 0.58, "grad_norm": 7.8125, "learning_rate": 2.22985905727069e-06, "logits/chosen": -1.651577353477478, "logits/rejected": -0.4403495192527771, "logps/chosen": -584.1673583984375, "logps/rejected": -697.8440551757812, "loss": 0.4211, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.2909111976623535, "rewards/margins": 1.9476287364959717, "rewards/rejected": -5.238539695739746, "step": 8880 }, { "epoch": 0.58, "grad_norm": 51.75, "learning_rate": 2.2241828767272993e-06, "logits/chosen": -1.2605105638504028, "logits/rejected": -0.7368279695510864, "logps/chosen": -578.7516479492188, "logps/rejected": -698.5487060546875, "loss": 0.8628, "rewards/accuracies": 0.625, "rewards/chosen": -4.026963233947754, "rewards/margins": 1.151451826095581, "rewards/rejected": -5.178415298461914, "step": 8890 }, { "epoch": 0.58, "grad_norm": 31.0, "learning_rate": 2.21850813519036e-06, "logits/chosen": -1.6335222721099854, "logits/rejected": -1.074105143547058, "logps/chosen": -599.6983642578125, "logps/rejected": -747.3090209960938, "loss": 0.6241, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.301395893096924, "rewards/margins": 1.7239574193954468, "rewards/rejected": -5.02535343170166, "step": 8900 }, { "epoch": 0.58, "eval_logits/chosen": -1.473730444908142, "eval_logits/rejected": -1.069178819656372, "eval_logps/chosen": -618.9297485351562, "eval_logps/rejected": -768.873046875, "eval_loss": 0.5260859131813049, "eval_rewards/accuracies": 0.7415000200271606, "eval_rewards/chosen": -3.5431010723114014, "eval_rewards/margins": 1.6999127864837646, "eval_rewards/rejected": -5.243014812469482, "eval_runtime": 1082.1926, "eval_samples_per_second": 1.848, "eval_steps_per_second": 1.848, "step": 8900 }, { "epoch": 0.58, "grad_norm": 21.375, "learning_rate": 2.212834862266409e-06, "logits/chosen": -1.3180955648422241, "logits/rejected": -1.3086185455322266, "logps/chosen": -542.6383056640625, "logps/rejected": -763.6268310546875, "loss": 0.4166, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.105239152908325, "rewards/margins": 2.1373226642608643, "rewards/rejected": -5.242562294006348, "step": 8910 }, { "epoch": 0.58, "grad_norm": 88.0, "learning_rate": 2.2071630875543206e-06, "logits/chosen": -1.3960086107254028, "logits/rejected": -1.2503150701522827, "logps/chosen": -712.3756103515625, "logps/rejected": -869.0198364257812, "loss": 0.6178, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.978764772415161, "rewards/margins": 1.632873296737671, "rewards/rejected": -5.611638069152832, "step": 8920 }, { "epoch": 0.58, "grad_norm": 10.8125, "learning_rate": 2.2014928406451523e-06, "logits/chosen": -1.5825363397598267, "logits/rejected": -1.2258191108703613, "logps/chosen": -645.654052734375, "logps/rejected": -803.6666259765625, "loss": 0.48, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.4413135051727295, "rewards/margins": 1.777423620223999, "rewards/rejected": -5.218737602233887, "step": 8930 }, { "epoch": 0.58, "grad_norm": 39.5, "learning_rate": 2.1958241511219904e-06, "logits/chosen": -1.353773593902588, "logits/rejected": -0.7033608555793762, "logps/chosen": -596.0380859375, "logps/rejected": -726.7489013671875, "loss": 0.5028, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.656647205352783, "rewards/margins": 1.570371150970459, "rewards/rejected": -5.2270188331604, "step": 8940 }, { "epoch": 0.59, "grad_norm": 13.4375, "learning_rate": 2.1901570485597967e-06, "logits/chosen": -1.6124407052993774, "logits/rejected": -0.8333123922348022, "logps/chosen": -604.83251953125, "logps/rejected": -760.7015380859375, "loss": 0.4662, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.4129855632781982, "rewards/margins": 2.158949613571167, "rewards/rejected": -5.571935176849365, "step": 8950 }, { "epoch": 0.59, "grad_norm": 28.5, "learning_rate": 2.184491562525253e-06, "logits/chosen": -1.397979497909546, "logits/rejected": -1.477784276008606, "logps/chosen": -570.2771606445312, "logps/rejected": -747.7635498046875, "loss": 0.6128, "rewards/accuracies": 0.625, "rewards/chosen": -3.3760814666748047, "rewards/margins": 1.561339259147644, "rewards/rejected": -4.937420845031738, "step": 8960 }, { "epoch": 0.59, "grad_norm": 30.375, "learning_rate": 2.178827722576607e-06, "logits/chosen": -1.2040798664093018, "logits/rejected": -1.412222146987915, "logps/chosen": -635.1268310546875, "logps/rejected": -820.4259643554688, "loss": 0.4817, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.51855731010437, "rewards/margins": 1.695092797279358, "rewards/rejected": -5.213650226593018, "step": 8970 }, { "epoch": 0.59, "grad_norm": 45.25, "learning_rate": 2.1731655582635204e-06, "logits/chosen": -1.5145366191864014, "logits/rejected": -0.8772974014282227, "logps/chosen": -685.6612548828125, "logps/rejected": -860.5919799804688, "loss": 0.4783, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.8929240703582764, "rewards/margins": 1.815303087234497, "rewards/rejected": -5.708226680755615, "step": 8980 }, { "epoch": 0.59, "grad_norm": 38.75, "learning_rate": 2.1675050991269096e-06, "logits/chosen": -1.4523141384124756, "logits/rejected": -1.3718873262405396, "logps/chosen": -598.851806640625, "logps/rejected": -719.3466796875, "loss": 0.4582, "rewards/accuracies": 0.75, "rewards/chosen": -3.5503902435302734, "rewards/margins": 1.406653642654419, "rewards/rejected": -4.957043647766113, "step": 8990 }, { "epoch": 0.59, "grad_norm": 21.75, "learning_rate": 2.1618463746987963e-06, "logits/chosen": -1.0113818645477295, "logits/rejected": -0.7274438738822937, "logps/chosen": -626.6242065429688, "logps/rejected": -778.2048950195312, "loss": 0.5426, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -3.475025177001953, "rewards/margins": 1.8215305805206299, "rewards/rejected": -5.296555519104004, "step": 9000 }, { "epoch": 0.59, "eval_logits/chosen": -1.473477840423584, "eval_logits/rejected": -1.084399938583374, "eval_logps/chosen": -607.385009765625, "eval_logps/rejected": -750.4478759765625, "eval_loss": 0.51231849193573, "eval_rewards/accuracies": 0.7415000200271606, "eval_rewards/chosen": -3.427654266357422, "eval_rewards/margins": 1.6311079263687134, "eval_rewards/rejected": -5.058762073516846, "eval_runtime": 1082.2, "eval_samples_per_second": 1.848, "eval_steps_per_second": 1.848, "step": 9000 }, { "epoch": 0.59, "grad_norm": 31.5, "learning_rate": 2.156189414502152e-06, "logits/chosen": -1.755895972251892, "logits/rejected": -0.8271828889846802, "logps/chosen": -582.0782470703125, "logps/rejected": -749.2775268554688, "loss": 0.5532, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.4812004566192627, "rewards/margins": 1.698400855064392, "rewards/rejected": -5.179602146148682, "step": 9010 }, { "epoch": 0.59, "grad_norm": 52.25, "learning_rate": 2.150534248050743e-06, "logits/chosen": -1.6130695343017578, "logits/rejected": -0.9991276860237122, "logps/chosen": -627.7455444335938, "logps/rejected": -720.5574951171875, "loss": 0.3992, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.417351484298706, "rewards/margins": 1.7189054489135742, "rewards/rejected": -5.136257171630859, "step": 9020 }, { "epoch": 0.59, "grad_norm": 14.25, "learning_rate": 2.144880904848978e-06, "logits/chosen": -1.6962497234344482, "logits/rejected": -1.2631187438964844, "logps/chosen": -654.972900390625, "logps/rejected": -807.2998046875, "loss": 0.5138, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.67301869392395, "rewards/margins": 1.596724271774292, "rewards/rejected": -5.2697434425354, "step": 9030 }, { "epoch": 0.59, "grad_norm": 10.0, "learning_rate": 2.1392294143917523e-06, "logits/chosen": -1.6780173778533936, "logits/rejected": -1.3220036029815674, "logps/chosen": -613.8530883789062, "logps/rejected": -689.9335327148438, "loss": 0.5027, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -3.4427096843719482, "rewards/margins": 1.4514610767364502, "rewards/rejected": -4.894170761108398, "step": 9040 }, { "epoch": 0.59, "grad_norm": 12.1875, "learning_rate": 2.1335798061642956e-06, "logits/chosen": -1.5891972780227661, "logits/rejected": -1.31023108959198, "logps/chosen": -501.76666259765625, "logps/rejected": -728.2667236328125, "loss": 0.351, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.800426483154297, "rewards/margins": 2.268376111984253, "rewards/rejected": -5.068802356719971, "step": 9050 }, { "epoch": 0.59, "grad_norm": 38.25, "learning_rate": 2.1279321096420187e-06, "logits/chosen": -1.4695019721984863, "logits/rejected": -0.9618878364562988, "logps/chosen": -672.6768798828125, "logps/rejected": -778.9002685546875, "loss": 0.5921, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.827690839767456, "rewards/margins": 1.2924864292144775, "rewards/rejected": -5.120177268981934, "step": 9060 }, { "epoch": 0.59, "grad_norm": 21.75, "learning_rate": 2.122286354290356e-06, "logits/chosen": -1.081133484840393, "logits/rejected": -0.8577007055282593, "logps/chosen": -565.1185302734375, "logps/rejected": -801.2058715820312, "loss": 0.4167, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.5876071453094482, "rewards/margins": 2.1504616737365723, "rewards/rejected": -5.738068580627441, "step": 9070 }, { "epoch": 0.59, "grad_norm": 22.375, "learning_rate": 2.116642569564615e-06, "logits/chosen": -1.3447649478912354, "logits/rejected": -0.8568156361579895, "logps/chosen": -639.5554809570312, "logps/rejected": -884.9255981445312, "loss": 0.4425, "rewards/accuracies": 0.75, "rewards/chosen": -3.5919241905212402, "rewards/margins": 2.532487154006958, "rewards/rejected": -6.124411582946777, "step": 9080 }, { "epoch": 0.59, "grad_norm": 68.0, "learning_rate": 2.1110007849098244e-06, "logits/chosen": -1.3798145055770874, "logits/rejected": -1.0513007640838623, "logps/chosen": -593.0330200195312, "logps/rejected": -726.92041015625, "loss": 0.4907, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.478754758834839, "rewards/margins": 1.7172943353652954, "rewards/rejected": -5.196049690246582, "step": 9090 }, { "epoch": 0.6, "grad_norm": 58.25, "learning_rate": 2.105361029760577e-06, "logits/chosen": -1.2629989385604858, "logits/rejected": -0.7031105160713196, "logps/chosen": -692.4131469726562, "logps/rejected": -716.1803588867188, "loss": 0.7459, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -3.9233508110046387, "rewards/margins": 0.7235302329063416, "rewards/rejected": -4.646881103515625, "step": 9100 }, { "epoch": 0.6, "eval_logits/chosen": -1.4295088052749634, "eval_logits/rejected": -1.0355795621871948, "eval_logps/chosen": -625.3505249023438, "eval_logps/rejected": -763.3653564453125, "eval_loss": 0.5096603631973267, "eval_rewards/accuracies": 0.746999979019165, "eval_rewards/chosen": -3.6073086261749268, "eval_rewards/margins": 1.580628752708435, "eval_rewards/rejected": -5.1879377365112305, "eval_runtime": 1081.8008, "eval_samples_per_second": 1.849, "eval_steps_per_second": 1.849, "step": 9100 }, { "epoch": 0.6, "grad_norm": 21.0, "learning_rate": 2.099723333540874e-06, "logits/chosen": -1.6948944330215454, "logits/rejected": -1.5425388813018799, "logps/chosen": -606.5047607421875, "logps/rejected": -666.9666137695312, "loss": 0.6936, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -3.317941188812256, "rewards/margins": 0.9352173805236816, "rewards/rejected": -4.2531585693359375, "step": 9110 }, { "epoch": 0.6, "grad_norm": 27.0, "learning_rate": 2.094087725663979e-06, "logits/chosen": -1.2598845958709717, "logits/rejected": -0.5171595811843872, "logps/chosen": -576.2872314453125, "logps/rejected": -761.7762451171875, "loss": 0.4967, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.3598484992980957, "rewards/margins": 1.9463021755218506, "rewards/rejected": -5.306150913238525, "step": 9120 }, { "epoch": 0.6, "grad_norm": 47.75, "learning_rate": 2.088454235532258e-06, "logits/chosen": -1.3638019561767578, "logits/rejected": -0.89886075258255, "logps/chosen": -561.6173095703125, "logps/rejected": -751.0125122070312, "loss": 0.4997, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.2408478260040283, "rewards/margins": 1.8076870441436768, "rewards/rejected": -5.048534870147705, "step": 9130 }, { "epoch": 0.6, "grad_norm": 191.0, "learning_rate": 2.082822892537027e-06, "logits/chosen": -1.265890121459961, "logits/rejected": -0.6382917761802673, "logps/chosen": -623.1561889648438, "logps/rejected": -723.26513671875, "loss": 0.6261, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.675720691680908, "rewards/margins": 1.3985204696655273, "rewards/rejected": -5.074240684509277, "step": 9140 }, { "epoch": 0.6, "grad_norm": 54.75, "learning_rate": 2.0771937260584034e-06, "logits/chosen": -1.209883689880371, "logits/rejected": -1.5710549354553223, "logps/chosen": -615.4727172851562, "logps/rejected": -755.8513793945312, "loss": 0.6809, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -3.9974637031555176, "rewards/margins": 1.110679268836975, "rewards/rejected": -5.108142375946045, "step": 9150 }, { "epoch": 0.6, "grad_norm": 54.75, "learning_rate": 2.0715667654651454e-06, "logits/chosen": -1.4411828517913818, "logits/rejected": -0.5171343088150024, "logps/chosen": -621.6505126953125, "logps/rejected": -807.4277954101562, "loss": 0.4333, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.8139426708221436, "rewards/margins": 2.2945456504821777, "rewards/rejected": -6.1084885597229, "step": 9160 }, { "epoch": 0.6, "grad_norm": 6.28125, "learning_rate": 2.0659420401145066e-06, "logits/chosen": -1.4201209545135498, "logits/rejected": -0.36391177773475647, "logps/chosen": -691.78466796875, "logps/rejected": -864.4724731445312, "loss": 0.5998, "rewards/accuracies": 0.875, "rewards/chosen": -4.295490264892578, "rewards/margins": 1.967839002609253, "rewards/rejected": -6.26332950592041, "step": 9170 }, { "epoch": 0.6, "grad_norm": 43.0, "learning_rate": 2.0603195793520743e-06, "logits/chosen": -1.3866536617279053, "logits/rejected": -1.072617769241333, "logps/chosen": -666.0487670898438, "logps/rejected": -828.4142456054688, "loss": 0.4909, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.077458381652832, "rewards/margins": 1.72836172580719, "rewards/rejected": -5.805819511413574, "step": 9180 }, { "epoch": 0.6, "grad_norm": 9.8125, "learning_rate": 2.0546994125116227e-06, "logits/chosen": -0.9660428762435913, "logits/rejected": -0.46666064858436584, "logps/chosen": -582.8367309570312, "logps/rejected": -733.2791137695312, "loss": 0.5119, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.8579773902893066, "rewards/margins": 1.4127486944198608, "rewards/rejected": -5.270726203918457, "step": 9190 }, { "epoch": 0.6, "grad_norm": 20.625, "learning_rate": 2.04908156891496e-06, "logits/chosen": -1.6387908458709717, "logits/rejected": -1.221289873123169, "logps/chosen": -593.5687255859375, "logps/rejected": -778.70751953125, "loss": 0.4619, "rewards/accuracies": 0.75, "rewards/chosen": -3.4691100120544434, "rewards/margins": 2.040217876434326, "rewards/rejected": -5.509328365325928, "step": 9200 }, { "epoch": 0.6, "eval_logits/chosen": -1.326981544494629, "eval_logits/rejected": -0.9207378625869751, "eval_logps/chosen": -683.789306640625, "eval_logps/rejected": -834.0685424804688, "eval_loss": 0.5202397108078003, "eval_rewards/accuracies": 0.7415000200271606, "eval_rewards/chosen": -4.191696643829346, "eval_rewards/margins": 1.7032716274261475, "eval_rewards/rejected": -5.8949689865112305, "eval_runtime": 1081.8071, "eval_samples_per_second": 1.849, "eval_steps_per_second": 1.849, "step": 9200 }, { "epoch": 0.6, "grad_norm": 29.625, "learning_rate": 2.0434660778717723e-06, "logits/chosen": -1.2855584621429443, "logits/rejected": -1.147896409034729, "logps/chosen": -714.60498046875, "logps/rejected": -795.3204345703125, "loss": 0.7521, "rewards/accuracies": 0.625, "rewards/chosen": -4.428966522216797, "rewards/margins": 0.816506028175354, "rewards/rejected": -5.245472431182861, "step": 9210 }, { "epoch": 0.6, "grad_norm": 18.125, "learning_rate": 2.0378529686794696e-06, "logits/chosen": -1.2469604015350342, "logits/rejected": -0.9728094935417175, "logps/chosen": -790.8663940429688, "logps/rejected": -865.6237182617188, "loss": 0.5651, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.530102252960205, "rewards/margins": 1.3570213317871094, "rewards/rejected": -5.887122631072998, "step": 9220 }, { "epoch": 0.6, "grad_norm": 12.5625, "learning_rate": 2.032242270623038e-06, "logits/chosen": -1.6159617900848389, "logits/rejected": -1.4232847690582275, "logps/chosen": -742.6838989257812, "logps/rejected": -800.5829467773438, "loss": 0.5988, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.089303016662598, "rewards/margins": 0.9831002950668335, "rewards/rejected": -5.072403907775879, "step": 9230 }, { "epoch": 0.6, "grad_norm": 18.125, "learning_rate": 2.0266340129748836e-06, "logits/chosen": -1.5057202577590942, "logits/rejected": -0.9572412371635437, "logps/chosen": -684.4494018554688, "logps/rejected": -831.32275390625, "loss": 0.4984, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.461272716522217, "rewards/margins": 1.5245856046676636, "rewards/rejected": -5.985858917236328, "step": 9240 }, { "epoch": 0.61, "grad_norm": 9.0625, "learning_rate": 2.0210282249946785e-06, "logits/chosen": -1.6169664859771729, "logits/rejected": -0.9237518310546875, "logps/chosen": -719.9649658203125, "logps/rejected": -834.8165283203125, "loss": 0.444, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.72863507270813, "rewards/margins": 1.8708362579345703, "rewards/rejected": -5.599471092224121, "step": 9250 }, { "epoch": 0.61, "grad_norm": 18.375, "learning_rate": 2.015424935929212e-06, "logits/chosen": -1.2418042421340942, "logits/rejected": -1.277060866355896, "logps/chosen": -656.4200439453125, "logps/rejected": -850.4601440429688, "loss": 0.4389, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.2504374980926514, "rewards/margins": 1.8390848636627197, "rewards/rejected": -5.089522361755371, "step": 9260 }, { "epoch": 0.61, "grad_norm": 70.0, "learning_rate": 2.009824175012235e-06, "logits/chosen": -1.2801949977874756, "logits/rejected": -1.2143417596817017, "logps/chosen": -589.7415771484375, "logps/rejected": -713.3714599609375, "loss": 0.528, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.2712810039520264, "rewards/margins": 1.3568689823150635, "rewards/rejected": -4.62814998626709, "step": 9270 }, { "epoch": 0.61, "grad_norm": 22.875, "learning_rate": 2.004225971464308e-06, "logits/chosen": -1.1419036388397217, "logits/rejected": -1.0101155042648315, "logps/chosen": -653.7296142578125, "logps/rejected": -721.9306640625, "loss": 0.8178, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -4.0132880210876465, "rewards/margins": 0.8811742663383484, "rewards/rejected": -4.8944621086120605, "step": 9280 }, { "epoch": 0.61, "grad_norm": 12.9375, "learning_rate": 1.9986303544926505e-06, "logits/chosen": -1.747882604598999, "logits/rejected": -1.2367589473724365, "logps/chosen": -611.9906005859375, "logps/rejected": -783.7923583984375, "loss": 0.3757, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.436415195465088, "rewards/margins": 1.8221994638442993, "rewards/rejected": -5.258614540100098, "step": 9290 }, { "epoch": 0.61, "grad_norm": 10.875, "learning_rate": 1.993037353290985e-06, "logits/chosen": -1.5499160289764404, "logits/rejected": -1.4632726907730103, "logps/chosen": -577.6898193359375, "logps/rejected": -834.36376953125, "loss": 0.3541, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.449538469314575, "rewards/margins": 1.9285895824432373, "rewards/rejected": -5.3781280517578125, "step": 9300 }, { "epoch": 0.61, "eval_logits/chosen": -1.5004675388336182, "eval_logits/rejected": -1.1179873943328857, "eval_logps/chosen": -608.5919189453125, "eval_logps/rejected": -743.0750122070312, "eval_loss": 0.5060610771179199, "eval_rewards/accuracies": 0.7480000257492065, "eval_rewards/chosen": -3.439723253250122, "eval_rewards/margins": 1.5453102588653564, "eval_rewards/rejected": -4.98503303527832, "eval_runtime": 1081.9969, "eval_samples_per_second": 1.848, "eval_steps_per_second": 1.848, "step": 9300 }, { "epoch": 0.61, "grad_norm": 13.5, "learning_rate": 1.98744699703939e-06, "logits/chosen": -1.409619927406311, "logits/rejected": -0.8134926557540894, "logps/chosen": -624.3377685546875, "logps/rejected": -733.558837890625, "loss": 0.5027, "rewards/accuracies": 0.75, "rewards/chosen": -3.636439561843872, "rewards/margins": 1.2627862691879272, "rewards/rejected": -4.89922571182251, "step": 9310 }, { "epoch": 0.61, "grad_norm": 40.25, "learning_rate": 1.9818593149041425e-06, "logits/chosen": -1.4757108688354492, "logits/rejected": -1.5214407444000244, "logps/chosen": -558.3359985351562, "logps/rejected": -741.0713500976562, "loss": 0.3825, "rewards/accuracies": 0.875, "rewards/chosen": -3.5006556510925293, "rewards/margins": 1.5507745742797852, "rewards/rejected": -5.0514302253723145, "step": 9320 }, { "epoch": 0.61, "grad_norm": 54.25, "learning_rate": 1.9762743360375673e-06, "logits/chosen": -1.0055055618286133, "logits/rejected": -1.1911537647247314, "logps/chosen": -675.6365966796875, "logps/rejected": -829.14306640625, "loss": 0.7008, "rewards/accuracies": 0.625, "rewards/chosen": -4.270930767059326, "rewards/margins": 1.1610107421875, "rewards/rejected": -5.431941032409668, "step": 9330 }, { "epoch": 0.61, "grad_norm": 33.75, "learning_rate": 1.9706920895778874e-06, "logits/chosen": -1.4728233814239502, "logits/rejected": -1.1093250513076782, "logps/chosen": -673.9620971679688, "logps/rejected": -779.1998901367188, "loss": 0.585, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -3.817030429840088, "rewards/margins": 1.2160959243774414, "rewards/rejected": -5.033125877380371, "step": 9340 }, { "epoch": 0.61, "grad_norm": 13.6875, "learning_rate": 1.965112604649069e-06, "logits/chosen": -1.2225950956344604, "logits/rejected": -0.49419888854026794, "logps/chosen": -619.0682983398438, "logps/rejected": -745.8886108398438, "loss": 0.446, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.780285596847534, "rewards/margins": 1.524556279182434, "rewards/rejected": -5.304842472076416, "step": 9350 }, { "epoch": 0.61, "grad_norm": 55.0, "learning_rate": 1.959535910360671e-06, "logits/chosen": -1.4540631771087646, "logits/rejected": -0.8800407648086548, "logps/chosen": -647.1994018554688, "logps/rejected": -759.8973388671875, "loss": 0.4986, "rewards/accuracies": 0.75, "rewards/chosen": -4.0559821128845215, "rewards/margins": 1.539245843887329, "rewards/rejected": -5.59522819519043, "step": 9360 }, { "epoch": 0.61, "grad_norm": 13.375, "learning_rate": 1.9539620358076932e-06, "logits/chosen": -1.707156777381897, "logits/rejected": -1.4398860931396484, "logps/chosen": -694.5609741210938, "logps/rejected": -859.1438598632812, "loss": 0.3344, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.5664284229278564, "rewards/margins": 2.0317790508270264, "rewards/rejected": -5.598206996917725, "step": 9370 }, { "epoch": 0.61, "grad_norm": 46.25, "learning_rate": 1.948391010070424e-06, "logits/chosen": -1.3961427211761475, "logits/rejected": -1.2206026315689087, "logps/chosen": -680.6915283203125, "logps/rejected": -821.5982666015625, "loss": 0.6005, "rewards/accuracies": 0.75, "rewards/chosen": -4.135708808898926, "rewards/margins": 1.5607359409332275, "rewards/rejected": -5.696445465087891, "step": 9380 }, { "epoch": 0.61, "grad_norm": 6.5, "learning_rate": 1.9428228622142875e-06, "logits/chosen": -1.3334215879440308, "logits/rejected": -0.9284563064575195, "logps/chosen": -590.3180541992188, "logps/rejected": -716.7012329101562, "loss": 0.4839, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.4222378730773926, "rewards/margins": 1.6018590927124023, "rewards/rejected": -5.024096488952637, "step": 9390 }, { "epoch": 0.62, "grad_norm": 26.5, "learning_rate": 1.937257621289696e-06, "logits/chosen": -1.299994707107544, "logits/rejected": -0.802538275718689, "logps/chosen": -547.3518676757812, "logps/rejected": -791.9742431640625, "loss": 0.4268, "rewards/accuracies": 0.75, "rewards/chosen": -3.6831889152526855, "rewards/margins": 2.34443998336792, "rewards/rejected": -6.0276288986206055, "step": 9400 }, { "epoch": 0.62, "eval_logits/chosen": -1.400264024734497, "eval_logits/rejected": -0.9942511916160583, "eval_logps/chosen": -660.4188232421875, "eval_logps/rejected": -817.337158203125, "eval_loss": 0.5186948180198669, "eval_rewards/accuracies": 0.7465000152587891, "eval_rewards/chosen": -3.9579920768737793, "eval_rewards/margins": 1.7696630954742432, "eval_rewards/rejected": -5.727654933929443, "eval_runtime": 1081.7898, "eval_samples_per_second": 1.849, "eval_steps_per_second": 1.849, "step": 9400 }, { "epoch": 0.62, "grad_norm": 4.78125, "learning_rate": 1.9316953163318917e-06, "logits/chosen": -1.1624081134796143, "logits/rejected": -1.1278831958770752, "logps/chosen": -651.1795654296875, "logps/rejected": -839.9393310546875, "loss": 0.4538, "rewards/accuracies": 0.75, "rewards/chosen": -4.046165466308594, "rewards/margins": 1.9098514318466187, "rewards/rejected": -5.956017017364502, "step": 9410 }, { "epoch": 0.62, "grad_norm": 17.875, "learning_rate": 1.9261359763608036e-06, "logits/chosen": -1.4925538301467896, "logits/rejected": -0.6730778813362122, "logps/chosen": -671.7996826171875, "logps/rejected": -767.2442626953125, "loss": 0.7048, "rewards/accuracies": 0.625, "rewards/chosen": -4.301414489746094, "rewards/margins": 1.0777337551116943, "rewards/rejected": -5.379148006439209, "step": 9420 }, { "epoch": 0.62, "grad_norm": 13.3125, "learning_rate": 1.9205796303808886e-06, "logits/chosen": -1.2327535152435303, "logits/rejected": -0.6809760928153992, "logps/chosen": -630.4798583984375, "logps/rejected": -847.7352294921875, "loss": 0.6045, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.156689643859863, "rewards/margins": 2.157869815826416, "rewards/rejected": -6.314559459686279, "step": 9430 }, { "epoch": 0.62, "grad_norm": 6.75, "learning_rate": 1.915026307380984e-06, "logits/chosen": -1.3822481632232666, "logits/rejected": -1.0498870611190796, "logps/chosen": -663.9434814453125, "logps/rejected": -850.0187377929688, "loss": 0.4488, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.392586708068848, "rewards/margins": 1.7731012105941772, "rewards/rejected": -6.165687561035156, "step": 9440 }, { "epoch": 0.62, "grad_norm": 75.0, "learning_rate": 1.9094760363341553e-06, "logits/chosen": -1.261415719985962, "logits/rejected": -1.1786569356918335, "logps/chosen": -705.1206665039062, "logps/rejected": -970.0733642578125, "loss": 0.452, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.230937957763672, "rewards/margins": 2.7727277278900146, "rewards/rejected": -7.003665924072266, "step": 9450 }, { "epoch": 0.62, "grad_norm": 65.5, "learning_rate": 1.9039288461975456e-06, "logits/chosen": -1.1310489177703857, "logits/rejected": -0.9868417978286743, "logps/chosen": -762.7117309570312, "logps/rejected": -918.0983276367188, "loss": 1.2643, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -5.1901044845581055, "rewards/margins": 1.1431361436843872, "rewards/rejected": -6.333239555358887, "step": 9460 }, { "epoch": 0.62, "grad_norm": 29.5, "learning_rate": 1.898384765912224e-06, "logits/chosen": -1.4956533908843994, "logits/rejected": -0.6245728135108948, "logps/chosen": -692.7674560546875, "logps/rejected": -938.5187377929688, "loss": 0.4786, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.06633186340332, "rewards/margins": 2.7912983894348145, "rewards/rejected": -6.857630252838135, "step": 9470 }, { "epoch": 0.62, "grad_norm": 12.75, "learning_rate": 1.892843824403034e-06, "logits/chosen": -0.9632704854011536, "logits/rejected": -0.37318065762519836, "logps/chosen": -732.5442504882812, "logps/rejected": -913.6612548828125, "loss": 0.3652, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.5393548011779785, "rewards/margins": 2.377885341644287, "rewards/rejected": -6.917239189147949, "step": 9480 }, { "epoch": 0.62, "grad_norm": 41.0, "learning_rate": 1.8873060505784446e-06, "logits/chosen": -1.3910750150680542, "logits/rejected": -1.1058140993118286, "logps/chosen": -681.1951293945312, "logps/rejected": -844.4328002929688, "loss": 0.4411, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.9603819847106934, "rewards/margins": 1.6514616012573242, "rewards/rejected": -5.611843109130859, "step": 9490 }, { "epoch": 0.62, "grad_norm": 27.375, "learning_rate": 1.8817714733303972e-06, "logits/chosen": -1.227161169052124, "logits/rejected": -1.0545676946640015, "logps/chosen": -696.9172973632812, "logps/rejected": -859.1433715820312, "loss": 0.6392, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.237255573272705, "rewards/margins": 1.3725202083587646, "rewards/rejected": -5.609775543212891, "step": 9500 }, { "epoch": 0.62, "eval_logits/chosen": -1.3307511806488037, "eval_logits/rejected": -0.8994446396827698, "eval_logps/chosen": -683.0696411132812, "eval_logps/rejected": -851.5308837890625, "eval_loss": 0.5297824740409851, "eval_rewards/accuracies": 0.7384999990463257, "eval_rewards/chosen": -4.184500694274902, "eval_rewards/margins": 1.8850908279418945, "eval_rewards/rejected": -6.069591522216797, "eval_runtime": 1082.0042, "eval_samples_per_second": 1.848, "eval_steps_per_second": 1.848, "step": 9500 }, { "epoch": 0.62, "grad_norm": 21.0, "learning_rate": 1.8762401215341569e-06, "logits/chosen": -1.4877293109893799, "logits/rejected": -0.35881510376930237, "logps/chosen": -739.7760009765625, "logps/rejected": -874.1494140625, "loss": 0.5602, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.381041049957275, "rewards/margins": 1.7861883640289307, "rewards/rejected": -6.167229652404785, "step": 9510 }, { "epoch": 0.62, "grad_norm": 23.75, "learning_rate": 1.8707120240481582e-06, "logits/chosen": -1.0102797746658325, "logits/rejected": -0.9458551406860352, "logps/chosen": -680.5265502929688, "logps/rejected": -793.0050048828125, "loss": 0.7213, "rewards/accuracies": 0.75, "rewards/chosen": -4.062864780426025, "rewards/margins": 1.4776676893234253, "rewards/rejected": -5.540532112121582, "step": 9520 }, { "epoch": 0.62, "grad_norm": 7.21875, "learning_rate": 1.8651872097138618e-06, "logits/chosen": -1.4467136859893799, "logits/rejected": -0.7643300890922546, "logps/chosen": -591.2861328125, "logps/rejected": -772.3933715820312, "loss": 0.5993, "rewards/accuracies": 0.75, "rewards/chosen": -3.4488213062286377, "rewards/margins": 2.1533913612365723, "rewards/rejected": -5.602212429046631, "step": 9530 }, { "epoch": 0.62, "grad_norm": 25.125, "learning_rate": 1.8596657073555946e-06, "logits/chosen": -1.2246088981628418, "logits/rejected": -1.104487657546997, "logps/chosen": -655.0626220703125, "logps/rejected": -815.7304077148438, "loss": 0.4584, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.037166595458984, "rewards/margins": 1.7237437963485718, "rewards/rejected": -5.760910511016846, "step": 9540 }, { "epoch": 0.62, "grad_norm": 38.75, "learning_rate": 1.854147545780407e-06, "logits/chosen": -1.4390347003936768, "logits/rejected": -0.589356541633606, "logps/chosen": -639.9791259765625, "logps/rejected": -840.6633911132812, "loss": 0.461, "rewards/accuracies": 0.75, "rewards/chosen": -3.830765962600708, "rewards/margins": 2.0415902137756348, "rewards/rejected": -5.87235689163208, "step": 9550 }, { "epoch": 0.63, "grad_norm": 11.1875, "learning_rate": 1.8486327537779181e-06, "logits/chosen": -1.2075833082199097, "logits/rejected": -1.158754587173462, "logps/chosen": -667.5443725585938, "logps/rejected": -863.9959106445312, "loss": 0.525, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.084089279174805, "rewards/margins": 1.9394521713256836, "rewards/rejected": -6.023541450500488, "step": 9560 }, { "epoch": 0.63, "grad_norm": 10.4375, "learning_rate": 1.8431213601201685e-06, "logits/chosen": -1.4990732669830322, "logits/rejected": -0.9672231674194336, "logps/chosen": -657.0826416015625, "logps/rejected": -759.6051025390625, "loss": 0.5278, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.7134532928466797, "rewards/margins": 1.2801915407180786, "rewards/rejected": -4.993645668029785, "step": 9570 }, { "epoch": 0.63, "grad_norm": 64.0, "learning_rate": 1.8376133935614683e-06, "logits/chosen": -1.4979077577590942, "logits/rejected": -0.7974768877029419, "logps/chosen": -739.82666015625, "logps/rejected": -828.7547607421875, "loss": 0.5374, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.093414783477783, "rewards/margins": 1.5830862522125244, "rewards/rejected": -5.6765007972717285, "step": 9580 }, { "epoch": 0.63, "grad_norm": 64.5, "learning_rate": 1.8321088828382454e-06, "logits/chosen": -0.9909995198249817, "logits/rejected": -0.5220507383346558, "logps/chosen": -665.5559692382812, "logps/rejected": -835.7369384765625, "loss": 0.4551, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.232287406921387, "rewards/margins": 1.7227668762207031, "rewards/rejected": -5.95505428314209, "step": 9590 }, { "epoch": 0.63, "grad_norm": 18.875, "learning_rate": 1.8266078566689023e-06, "logits/chosen": -1.3309686183929443, "logits/rejected": -0.7470362782478333, "logps/chosen": -710.2805786132812, "logps/rejected": -833.3356323242188, "loss": 0.6151, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.184802532196045, "rewards/margins": 1.6399791240692139, "rewards/rejected": -5.824782371520996, "step": 9600 }, { "epoch": 0.63, "eval_logits/chosen": -1.3883379697799683, "eval_logits/rejected": -0.9558569192886353, "eval_logps/chosen": -653.8218994140625, "eval_logps/rejected": -815.56298828125, "eval_loss": 0.5237345099449158, "eval_rewards/accuracies": 0.7440000176429749, "eval_rewards/chosen": -3.8920223712921143, "eval_rewards/margins": 1.8178907632827759, "eval_rewards/rejected": -5.70991325378418, "eval_runtime": 1082.2917, "eval_samples_per_second": 1.848, "eval_steps_per_second": 1.848, "step": 9600 }, { "epoch": 0.63, "grad_norm": 64.5, "learning_rate": 1.821110343753657e-06, "logits/chosen": -1.5200971364974976, "logits/rejected": -0.7219610214233398, "logps/chosen": -610.89453125, "logps/rejected": -804.0982666015625, "loss": 0.626, "rewards/accuracies": 0.75, "rewards/chosen": -4.186071395874023, "rewards/margins": 1.964112639427185, "rewards/rejected": -6.15018367767334, "step": 9610 }, { "epoch": 0.63, "grad_norm": 30.0, "learning_rate": 1.8156163727744015e-06, "logits/chosen": -1.4919555187225342, "logits/rejected": -1.231110692024231, "logps/chosen": -598.7774658203125, "logps/rejected": -851.2254638671875, "loss": 0.5567, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.7840869426727295, "rewards/margins": 2.015737533569336, "rewards/rejected": -5.7998247146606445, "step": 9620 }, { "epoch": 0.63, "grad_norm": 23.875, "learning_rate": 1.8101259723945459e-06, "logits/chosen": -1.4093317985534668, "logits/rejected": -0.7855949401855469, "logps/chosen": -672.3359375, "logps/rejected": -861.9812622070312, "loss": 0.4329, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.501742124557495, "rewards/margins": 2.1253974437713623, "rewards/rejected": -5.627139091491699, "step": 9630 }, { "epoch": 0.63, "grad_norm": 11.4375, "learning_rate": 1.8046391712588746e-06, "logits/chosen": -1.5120878219604492, "logits/rejected": -1.3128072023391724, "logps/chosen": -541.7186279296875, "logps/rejected": -804.45263671875, "loss": 0.2403, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.3221206665039062, "rewards/margins": 2.5772669315338135, "rewards/rejected": -5.899387836456299, "step": 9640 }, { "epoch": 0.63, "grad_norm": 9.375, "learning_rate": 1.799155997993391e-06, "logits/chosen": -1.2168834209442139, "logits/rejected": -1.1701622009277344, "logps/chosen": -634.354248046875, "logps/rejected": -820.5637817382812, "loss": 0.5102, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.9395415782928467, "rewards/margins": 1.9209792613983154, "rewards/rejected": -5.860520362854004, "step": 9650 }, { "epoch": 0.63, "grad_norm": 16.375, "learning_rate": 1.7936764812051728e-06, "logits/chosen": -1.4013276100158691, "logits/rejected": -1.3793084621429443, "logps/chosen": -604.1414794921875, "logps/rejected": -740.4356689453125, "loss": 0.4523, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.5337860584259033, "rewards/margins": 1.5189335346221924, "rewards/rejected": -5.052720069885254, "step": 9660 }, { "epoch": 0.63, "grad_norm": 26.875, "learning_rate": 1.7882006494822197e-06, "logits/chosen": -1.3162987232208252, "logits/rejected": -0.8512665629386902, "logps/chosen": -682.7770385742188, "logps/rejected": -816.5172119140625, "loss": 0.5956, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.210787773132324, "rewards/margins": 1.4024453163146973, "rewards/rejected": -5.613232612609863, "step": 9670 }, { "epoch": 0.63, "grad_norm": 11.75, "learning_rate": 1.7827285313933063e-06, "logits/chosen": -1.067264437675476, "logits/rejected": -1.258851408958435, "logps/chosen": -617.1749267578125, "logps/rejected": -750.3770751953125, "loss": 0.6638, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -3.804946184158325, "rewards/margins": 1.5089433193206787, "rewards/rejected": -5.313889503479004, "step": 9680 }, { "epoch": 0.63, "grad_norm": 17.625, "learning_rate": 1.7772601554878326e-06, "logits/chosen": -1.7114393711090088, "logits/rejected": -1.1726324558258057, "logps/chosen": -637.5438842773438, "logps/rejected": -821.091796875, "loss": 0.3805, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.78291392326355, "rewards/margins": 2.1484310626983643, "rewards/rejected": -5.931345462799072, "step": 9690 }, { "epoch": 0.63, "grad_norm": 15.0625, "learning_rate": 1.7717955502956734e-06, "logits/chosen": -1.5776903629302979, "logits/rejected": -0.8023563623428345, "logps/chosen": -735.2310791015625, "logps/rejected": -904.5573120117188, "loss": 0.4596, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.131260871887207, "rewards/margins": 2.2760369777679443, "rewards/rejected": -6.407297611236572, "step": 9700 }, { "epoch": 0.63, "eval_logits/chosen": -1.4511213302612305, "eval_logits/rejected": -1.0611268281936646, "eval_logps/chosen": -644.0645141601562, "eval_logps/rejected": -812.1489868164062, "eval_loss": 0.5332624316215515, "eval_rewards/accuracies": 0.746999979019165, "eval_rewards/chosen": -3.794449806213379, "eval_rewards/margins": 1.8813235759735107, "eval_rewards/rejected": -5.675773620605469, "eval_runtime": 1082.0819, "eval_samples_per_second": 1.848, "eval_steps_per_second": 1.848, "step": 9700 }, { "epoch": 0.64, "grad_norm": 15.6875, "learning_rate": 1.7663347443270326e-06, "logits/chosen": -1.2006757259368896, "logits/rejected": -1.1646760702133179, "logps/chosen": -689.7564697265625, "logps/rejected": -911.4365234375, "loss": 0.5608, "rewards/accuracies": 0.75, "rewards/chosen": -4.183384895324707, "rewards/margins": 2.0117850303649902, "rewards/rejected": -6.1951704025268555, "step": 9710 }, { "epoch": 0.64, "grad_norm": 48.25, "learning_rate": 1.7608777660722908e-06, "logits/chosen": -1.679500937461853, "logits/rejected": -0.6159299612045288, "logps/chosen": -648.7130126953125, "logps/rejected": -774.0250244140625, "loss": 0.6158, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.966818332672119, "rewards/margins": 1.734483003616333, "rewards/rejected": -5.701301097869873, "step": 9720 }, { "epoch": 0.64, "grad_norm": 4.6875, "learning_rate": 1.7554246440018586e-06, "logits/chosen": -1.0722570419311523, "logits/rejected": -0.10302643477916718, "logps/chosen": -634.0631713867188, "logps/rejected": -816.153564453125, "loss": 0.354, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.703505754470825, "rewards/margins": 2.7512030601501465, "rewards/rejected": -6.454709053039551, "step": 9730 }, { "epoch": 0.64, "grad_norm": 6.25, "learning_rate": 1.7499754065660288e-06, "logits/chosen": -0.9838741421699524, "logits/rejected": -0.9053605794906616, "logps/chosen": -611.5914306640625, "logps/rejected": -882.67236328125, "loss": 0.3153, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.09970760345459, "rewards/margins": 2.475245952606201, "rewards/rejected": -6.574953556060791, "step": 9740 }, { "epoch": 0.64, "grad_norm": 24.375, "learning_rate": 1.744530082194828e-06, "logits/chosen": -1.2665590047836304, "logits/rejected": -1.2727930545806885, "logps/chosen": -664.0533447265625, "logps/rejected": -835.3433837890625, "loss": 0.5467, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.182080268859863, "rewards/margins": 1.8053550720214844, "rewards/rejected": -5.9874348640441895, "step": 9750 }, { "epoch": 0.64, "grad_norm": 50.5, "learning_rate": 1.7390886992978653e-06, "logits/chosen": -1.1944701671600342, "logits/rejected": -1.4005664587020874, "logps/chosen": -634.5916137695312, "logps/rejected": -907.9439697265625, "loss": 0.3575, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.708355665206909, "rewards/margins": 2.5377109050750732, "rewards/rejected": -6.246066093444824, "step": 9760 }, { "epoch": 0.64, "grad_norm": 39.5, "learning_rate": 1.7336512862641874e-06, "logits/chosen": -0.9762150049209595, "logits/rejected": -0.5215164422988892, "logps/chosen": -669.7718505859375, "logps/rejected": -826.00390625, "loss": 0.6167, "rewards/accuracies": 0.75, "rewards/chosen": -4.191767692565918, "rewards/margins": 1.9897741079330444, "rewards/rejected": -6.181541919708252, "step": 9770 }, { "epoch": 0.64, "grad_norm": 6.0625, "learning_rate": 1.7282178714621288e-06, "logits/chosen": -1.2495208978652954, "logits/rejected": -0.8139098286628723, "logps/chosen": -703.7184448242188, "logps/rejected": -849.064453125, "loss": 0.4983, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.3152360916137695, "rewards/margins": 1.860706090927124, "rewards/rejected": -6.175942420959473, "step": 9780 }, { "epoch": 0.64, "grad_norm": 37.5, "learning_rate": 1.7227884832391637e-06, "logits/chosen": -1.4152500629425049, "logits/rejected": -1.3036797046661377, "logps/chosen": -813.8082885742188, "logps/rejected": -968.1262817382812, "loss": 0.5957, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.684041500091553, "rewards/margins": 2.0027377605438232, "rewards/rejected": -6.686779022216797, "step": 9790 }, { "epoch": 0.64, "grad_norm": 46.0, "learning_rate": 1.7173631499217602e-06, "logits/chosen": -1.1232144832611084, "logits/rejected": -0.5624216794967651, "logps/chosen": -743.7877197265625, "logps/rejected": -946.9937744140625, "loss": 0.6714, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.898040771484375, "rewards/margins": 2.1317381858825684, "rewards/rejected": -7.029778957366943, "step": 9800 }, { "epoch": 0.64, "eval_logits/chosen": -1.3444602489471436, "eval_logits/rejected": -0.9338452219963074, "eval_logps/chosen": -707.3235473632812, "eval_logps/rejected": -902.2876586914062, "eval_loss": 0.5592412948608398, "eval_rewards/accuracies": 0.7384999990463257, "eval_rewards/chosen": -4.42703914642334, "eval_rewards/margins": 2.150120258331299, "eval_rewards/rejected": -6.577159404754639, "eval_runtime": 1081.7805, "eval_samples_per_second": 1.849, "eval_steps_per_second": 1.849, "step": 9800 }, { "epoch": 0.64, "grad_norm": 35.25, "learning_rate": 1.7119418998152287e-06, "logits/chosen": -1.3319625854492188, "logits/rejected": -0.6727578639984131, "logps/chosen": -718.9151000976562, "logps/rejected": -900.5543212890625, "loss": 0.5986, "rewards/accuracies": 0.75, "rewards/chosen": -4.532338619232178, "rewards/margins": 1.7879562377929688, "rewards/rejected": -6.3202948570251465, "step": 9810 }, { "epoch": 0.64, "grad_norm": 15.4375, "learning_rate": 1.7065247612035793e-06, "logits/chosen": -1.3571736812591553, "logits/rejected": -0.348560631275177, "logps/chosen": -711.55224609375, "logps/rejected": -925.8137817382812, "loss": 0.3427, "rewards/accuracies": 0.875, "rewards/chosen": -4.6497392654418945, "rewards/margins": 2.650221109390259, "rewards/rejected": -7.299959659576416, "step": 9820 }, { "epoch": 0.64, "grad_norm": 24.0, "learning_rate": 1.7011117623493691e-06, "logits/chosen": -1.2337242364883423, "logits/rejected": 0.019543254747986794, "logps/chosen": -798.2955322265625, "logps/rejected": -959.3108520507812, "loss": 0.5114, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.815093994140625, "rewards/margins": 2.3745789527893066, "rewards/rejected": -7.18967342376709, "step": 9830 }, { "epoch": 0.64, "grad_norm": 67.0, "learning_rate": 1.6957029314935575e-06, "logits/chosen": -1.365659475326538, "logits/rejected": -0.7072588801383972, "logps/chosen": -780.5360107421875, "logps/rejected": -878.8068237304688, "loss": 0.8296, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.9021196365356445, "rewards/margins": 1.3420066833496094, "rewards/rejected": -6.244126319885254, "step": 9840 }, { "epoch": 0.64, "grad_norm": 40.0, "learning_rate": 1.6902982968553588e-06, "logits/chosen": -1.3933206796646118, "logits/rejected": -0.6576050519943237, "logps/chosen": -683.742919921875, "logps/rejected": -875.6728515625, "loss": 0.7099, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.527164459228516, "rewards/margins": 1.9458363056182861, "rewards/rejected": -6.473000526428223, "step": 9850 }, { "epoch": 0.65, "grad_norm": 31.5, "learning_rate": 1.6848978866320955e-06, "logits/chosen": -1.3975956439971924, "logits/rejected": -0.9457235336303711, "logps/chosen": -712.5390625, "logps/rejected": -799.8958129882812, "loss": 0.657, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.589311599731445, "rewards/margins": 1.5117348432540894, "rewards/rejected": -6.101046562194824, "step": 9860 }, { "epoch": 0.65, "grad_norm": 20.875, "learning_rate": 1.679501728999049e-06, "logits/chosen": -1.6311700344085693, "logits/rejected": -0.9650055170059204, "logps/chosen": -688.1824340820312, "logps/rejected": -834.74365234375, "loss": 0.4195, "rewards/accuracies": 0.75, "rewards/chosen": -4.1752519607543945, "rewards/margins": 1.9379059076309204, "rewards/rejected": -6.113158226013184, "step": 9870 }, { "epoch": 0.65, "grad_norm": 20.5, "learning_rate": 1.6741098521093136e-06, "logits/chosen": -1.4847437143325806, "logits/rejected": -0.9676124453544617, "logps/chosen": -645.314453125, "logps/rejected": -840.8898315429688, "loss": 0.4666, "rewards/accuracies": 0.75, "rewards/chosen": -3.9374442100524902, "rewards/margins": 2.1586430072784424, "rewards/rejected": -6.096087455749512, "step": 9880 }, { "epoch": 0.65, "grad_norm": 37.5, "learning_rate": 1.6687222840936513e-06, "logits/chosen": -1.440901517868042, "logits/rejected": -1.0438569784164429, "logps/chosen": -814.6585693359375, "logps/rejected": -1079.14453125, "loss": 0.4971, "rewards/accuracies": 0.75, "rewards/chosen": -5.045413017272949, "rewards/margins": 2.538839101791382, "rewards/rejected": -7.584251403808594, "step": 9890 }, { "epoch": 0.65, "grad_norm": 30.375, "learning_rate": 1.6633390530603417e-06, "logits/chosen": -1.1210899353027344, "logits/rejected": -0.6491280794143677, "logps/chosen": -674.04345703125, "logps/rejected": -887.1945190429688, "loss": 0.6304, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.508164405822754, "rewards/margins": 1.9435460567474365, "rewards/rejected": -6.4517107009887695, "step": 9900 }, { "epoch": 0.65, "eval_logits/chosen": -1.3756355047225952, "eval_logits/rejected": -0.9850223064422607, "eval_logps/chosen": -708.5909423828125, "eval_logps/rejected": -888.516357421875, "eval_loss": 0.5397689342498779, "eval_rewards/accuracies": 0.7409999966621399, "eval_rewards/chosen": -4.439713954925537, "eval_rewards/margins": 1.9997323751449585, "eval_rewards/rejected": -6.439446926116943, "eval_runtime": 1082.0889, "eval_samples_per_second": 1.848, "eval_steps_per_second": 1.848, "step": 9900 }, { "epoch": 0.65, "grad_norm": 19.75, "learning_rate": 1.6579601870950398e-06, "logits/chosen": -1.5334781408309937, "logits/rejected": -0.7680279016494751, "logps/chosen": -786.6510009765625, "logps/rejected": -1047.2591552734375, "loss": 0.3099, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.849265098571777, "rewards/margins": 3.07627534866333, "rewards/rejected": -7.925540924072266, "step": 9910 }, { "epoch": 0.65, "grad_norm": 2.125, "learning_rate": 1.652585714260625e-06, "logits/chosen": -1.3284709453582764, "logits/rejected": -1.0961860418319702, "logps/chosen": -642.3963623046875, "logps/rejected": -802.1507568359375, "loss": 0.5477, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.202449321746826, "rewards/margins": 1.8108571767807007, "rewards/rejected": -6.013306617736816, "step": 9920 }, { "epoch": 0.65, "grad_norm": 11.5625, "learning_rate": 1.6472156625970565e-06, "logits/chosen": -0.9901440739631653, "logits/rejected": -1.1614863872528076, "logps/chosen": -646.3113403320312, "logps/rejected": -941.3834838867188, "loss": 0.5016, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.170406818389893, "rewards/margins": 2.5094244480133057, "rewards/rejected": -6.679831027984619, "step": 9930 }, { "epoch": 0.65, "grad_norm": 12.0625, "learning_rate": 1.6418500601212288e-06, "logits/chosen": -1.2866747379302979, "logits/rejected": -1.219088077545166, "logps/chosen": -690.6443481445312, "logps/rejected": -836.4072265625, "loss": 0.5968, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.075140476226807, "rewards/margins": 1.9261252880096436, "rewards/rejected": -6.001265525817871, "step": 9940 }, { "epoch": 0.65, "grad_norm": 7.78125, "learning_rate": 1.6364889348268225e-06, "logits/chosen": -1.5702639818191528, "logits/rejected": -0.24345020949840546, "logps/chosen": -674.7689208984375, "logps/rejected": -891.9716796875, "loss": 0.4872, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.367429733276367, "rewards/margins": 2.536357879638672, "rewards/rejected": -6.903788089752197, "step": 9950 }, { "epoch": 0.65, "grad_norm": 20.0, "learning_rate": 1.6311323146841598e-06, "logits/chosen": -1.3737237453460693, "logits/rejected": -0.5897197127342224, "logps/chosen": -704.71875, "logps/rejected": -845.8082275390625, "loss": 0.9811, "rewards/accuracies": 0.75, "rewards/chosen": -4.680957317352295, "rewards/margins": 1.693399429321289, "rewards/rejected": -6.374356746673584, "step": 9960 }, { "epoch": 0.65, "grad_norm": 32.25, "learning_rate": 1.6257802276400604e-06, "logits/chosen": -1.286382794380188, "logits/rejected": -0.9835619926452637, "logps/chosen": -659.3897705078125, "logps/rejected": -883.0843505859375, "loss": 0.5428, "rewards/accuracies": 0.75, "rewards/chosen": -4.1121907234191895, "rewards/margins": 2.2841832637786865, "rewards/rejected": -6.396373748779297, "step": 9970 }, { "epoch": 0.65, "grad_norm": 43.0, "learning_rate": 1.6204327016176902e-06, "logits/chosen": -1.466408371925354, "logits/rejected": -0.8900888562202454, "logps/chosen": -721.4140625, "logps/rejected": -864.7144775390625, "loss": 0.537, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.17879581451416, "rewards/margins": 1.8766186237335205, "rewards/rejected": -6.055413722991943, "step": 9980 }, { "epoch": 0.65, "grad_norm": 36.0, "learning_rate": 1.6150897645164209e-06, "logits/chosen": -1.2164056301116943, "logits/rejected": -0.5492855310440063, "logps/chosen": -644.7044677734375, "logps/rejected": -863.64404296875, "loss": 0.5214, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.051421165466309, "rewards/margins": 1.8911558389663696, "rewards/rejected": -5.942576885223389, "step": 9990 }, { "epoch": 0.65, "grad_norm": 2.234375, "learning_rate": 1.6097514442116844e-06, "logits/chosen": -1.4516348838806152, "logits/rejected": -1.074739694595337, "logps/chosen": -639.0194091796875, "logps/rejected": -827.8511962890625, "loss": 0.463, "rewards/accuracies": 0.75, "rewards/chosen": -3.8107712268829346, "rewards/margins": 1.998748779296875, "rewards/rejected": -5.809520244598389, "step": 10000 }, { "epoch": 0.65, "eval_logits/chosen": -1.4191621541976929, "eval_logits/rejected": -1.041384220123291, "eval_logps/chosen": -685.0887451171875, "eval_logps/rejected": -855.3673706054688, "eval_loss": 0.5291036367416382, "eval_rewards/accuracies": 0.746999979019165, "eval_rewards/chosen": -4.204691410064697, "eval_rewards/margins": 1.9032647609710693, "eval_rewards/rejected": -6.107956886291504, "eval_runtime": 1082.1823, "eval_samples_per_second": 1.848, "eval_steps_per_second": 1.848, "step": 10000 }, { "epoch": 0.65, "grad_norm": 12.75, "learning_rate": 1.6044177685548217e-06, "logits/chosen": -1.5657777786254883, "logits/rejected": -0.8757058382034302, "logps/chosen": -744.9771118164062, "logps/rejected": -894.1316528320312, "loss": 0.5941, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.824192047119141, "rewards/margins": 1.813452124595642, "rewards/rejected": -6.637644290924072, "step": 10010 }, { "epoch": 0.66, "grad_norm": 13.4375, "learning_rate": 1.5990887653729455e-06, "logits/chosen": -1.631326675415039, "logits/rejected": -1.0424257516860962, "logps/chosen": -708.4505004882812, "logps/rejected": -909.6574096679688, "loss": 0.5447, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.098111152648926, "rewards/margins": 2.2930634021759033, "rewards/rejected": -6.391175270080566, "step": 10020 }, { "epoch": 0.66, "grad_norm": 15.0, "learning_rate": 1.5937644624687886e-06, "logits/chosen": -1.2427434921264648, "logits/rejected": -0.7414315938949585, "logps/chosen": -711.0717163085938, "logps/rejected": -833.2828369140625, "loss": 0.5621, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.499980449676514, "rewards/margins": 1.6926298141479492, "rewards/rejected": -6.192610263824463, "step": 10030 }, { "epoch": 0.66, "grad_norm": 57.0, "learning_rate": 1.5884448876205613e-06, "logits/chosen": -1.725219488143921, "logits/rejected": -1.221382975578308, "logps/chosen": -713.7057495117188, "logps/rejected": -845.9508666992188, "loss": 0.4946, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.284564971923828, "rewards/margins": 1.5368564128875732, "rewards/rejected": -5.8214216232299805, "step": 10040 }, { "epoch": 0.66, "grad_norm": 6.6875, "learning_rate": 1.5831300685818082e-06, "logits/chosen": -1.590414047241211, "logits/rejected": -0.7851971387863159, "logps/chosen": -721.3682250976562, "logps/rejected": -918.1746826171875, "loss": 0.4682, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.38002872467041, "rewards/margins": 2.2454450130462646, "rewards/rejected": -6.6254730224609375, "step": 10050 }, { "epoch": 0.66, "grad_norm": 36.0, "learning_rate": 1.5778200330812587e-06, "logits/chosen": -1.400907039642334, "logits/rejected": -0.8004618883132935, "logps/chosen": -746.0275268554688, "logps/rejected": -1022.3947143554688, "loss": 0.4688, "rewards/accuracies": 0.75, "rewards/chosen": -4.639899253845215, "rewards/margins": 2.668985366821289, "rewards/rejected": -7.308885097503662, "step": 10060 }, { "epoch": 0.66, "grad_norm": 21.25, "learning_rate": 1.57251480882269e-06, "logits/chosen": -1.0994516611099243, "logits/rejected": -1.5077083110809326, "logps/chosen": -680.7872924804688, "logps/rejected": -865.4332275390625, "loss": 0.6614, "rewards/accuracies": 0.75, "rewards/chosen": -4.255684852600098, "rewards/margins": 1.9739439487457275, "rewards/rejected": -6.229628562927246, "step": 10070 }, { "epoch": 0.66, "grad_norm": 14.625, "learning_rate": 1.5672144234847725e-06, "logits/chosen": -1.2916462421417236, "logits/rejected": -1.3630914688110352, "logps/chosen": -712.6106567382812, "logps/rejected": -911.7330322265625, "loss": 0.3668, "rewards/accuracies": 0.875, "rewards/chosen": -4.437014102935791, "rewards/margins": 2.253030776977539, "rewards/rejected": -6.690045356750488, "step": 10080 }, { "epoch": 0.66, "grad_norm": 9.8125, "learning_rate": 1.561918904720935e-06, "logits/chosen": -1.632168173789978, "logits/rejected": -1.194054126739502, "logps/chosen": -782.8458862304688, "logps/rejected": -975.6322021484375, "loss": 0.4002, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.582988739013672, "rewards/margins": 2.29475736618042, "rewards/rejected": -6.877745628356934, "step": 10090 }, { "epoch": 0.66, "grad_norm": 9.9375, "learning_rate": 1.5566282801592131e-06, "logits/chosen": -1.4893324375152588, "logits/rejected": -0.9826874732971191, "logps/chosen": -703.5165405273438, "logps/rejected": -931.4017333984375, "loss": 0.4455, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.347476482391357, "rewards/margins": 2.5361175537109375, "rewards/rejected": -6.883593559265137, "step": 10100 }, { "epoch": 0.66, "eval_logits/chosen": -1.3677505254745483, "eval_logits/rejected": -0.983038067817688, "eval_logps/chosen": -721.8720703125, "eval_logps/rejected": -903.6422119140625, "eval_loss": 0.543100893497467, "eval_rewards/accuracies": 0.7450000047683716, "eval_rewards/chosen": -4.572524547576904, "eval_rewards/margins": 2.018181085586548, "eval_rewards/rejected": -6.590704917907715, "eval_runtime": 1082.2322, "eval_samples_per_second": 1.848, "eval_steps_per_second": 1.848, "step": 10100 }, { "epoch": 0.66, "grad_norm": 4.25, "learning_rate": 1.55134257740211e-06, "logits/chosen": -1.258418083190918, "logits/rejected": -1.037466049194336, "logps/chosen": -717.6865844726562, "logps/rejected": -959.0203857421875, "loss": 0.5658, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.453530311584473, "rewards/margins": 2.736948251724243, "rewards/rejected": -7.1904778480529785, "step": 10110 }, { "epoch": 0.66, "grad_norm": 90.0, "learning_rate": 1.5460618240264508e-06, "logits/chosen": -1.1420419216156006, "logits/rejected": -1.0841926336288452, "logps/chosen": -608.7236938476562, "logps/rejected": -779.9000244140625, "loss": 0.576, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.236821174621582, "rewards/margins": 1.8547357320785522, "rewards/rejected": -6.091557502746582, "step": 10120 }, { "epoch": 0.66, "grad_norm": 9.4375, "learning_rate": 1.5407860475832362e-06, "logits/chosen": -1.1833328008651733, "logits/rejected": -1.2463109493255615, "logps/chosen": -653.4122924804688, "logps/rejected": -784.2322998046875, "loss": 0.5876, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.9241669178009033, "rewards/margins": 1.6907835006713867, "rewards/rejected": -5.614950180053711, "step": 10130 }, { "epoch": 0.66, "grad_norm": 45.75, "learning_rate": 1.5355152755975038e-06, "logits/chosen": -1.5479705333709717, "logits/rejected": -0.9517908096313477, "logps/chosen": -610.8402099609375, "logps/rejected": -750.8898315429688, "loss": 0.4098, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.7124619483947754, "rewards/margins": 1.8992376327514648, "rewards/rejected": -5.611699104309082, "step": 10140 }, { "epoch": 0.66, "grad_norm": 3.578125, "learning_rate": 1.5302495355681795e-06, "logits/chosen": -1.1657073497772217, "logits/rejected": -1.1729596853256226, "logps/chosen": -637.9486083984375, "logps/rejected": -879.9600830078125, "loss": 0.374, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.251258850097656, "rewards/margins": 2.30987286567688, "rewards/rejected": -6.561131954193115, "step": 10150 }, { "epoch": 0.66, "grad_norm": 17.0, "learning_rate": 1.5249888549679382e-06, "logits/chosen": -1.5188274383544922, "logits/rejected": -0.9803338050842285, "logps/chosen": -735.1993408203125, "logps/rejected": -921.6483154296875, "loss": 0.5359, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.554887294769287, "rewards/margins": 2.08750319480896, "rewards/rejected": -6.642390251159668, "step": 10160 }, { "epoch": 0.67, "grad_norm": 12.8125, "learning_rate": 1.519733261243056e-06, "logits/chosen": -1.3887909650802612, "logits/rejected": -0.23237380385398865, "logps/chosen": -750.53369140625, "logps/rejected": -958.8772583007812, "loss": 0.4631, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.962498188018799, "rewards/margins": 2.295506000518799, "rewards/rejected": -7.258004188537598, "step": 10170 }, { "epoch": 0.67, "grad_norm": 101.5, "learning_rate": 1.5144827818132723e-06, "logits/chosen": -1.0095363855361938, "logits/rejected": -0.10261247307062149, "logps/chosen": -782.8634033203125, "logps/rejected": -883.9385986328125, "loss": 0.7483, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -5.3678297996521, "rewards/margins": 1.619797706604004, "rewards/rejected": -6.9876275062561035, "step": 10180 }, { "epoch": 0.67, "grad_norm": 15.0625, "learning_rate": 1.5092374440716422e-06, "logits/chosen": -1.186562418937683, "logits/rejected": -1.0280050039291382, "logps/chosen": -731.7576904296875, "logps/rejected": -982.4480590820312, "loss": 0.3241, "rewards/accuracies": 0.875, "rewards/chosen": -4.50623083114624, "rewards/margins": 2.7449519634246826, "rewards/rejected": -7.251182556152344, "step": 10190 }, { "epoch": 0.67, "grad_norm": 22.25, "learning_rate": 1.5039972753843966e-06, "logits/chosen": -1.376997709274292, "logits/rejected": -0.44482851028442383, "logps/chosen": -719.293701171875, "logps/rejected": -919.39892578125, "loss": 0.3541, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.132431983947754, "rewards/margins": 2.9987435340881348, "rewards/rejected": -7.131174564361572, "step": 10200 }, { "epoch": 0.67, "eval_logits/chosen": -1.3058794736862183, "eval_logits/rejected": -0.901430070400238, "eval_logps/chosen": -744.9924926757812, "eval_logps/rejected": -936.1205444335938, "eval_loss": 0.5515990257263184, "eval_rewards/accuracies": 0.7455000281333923, "eval_rewards/chosen": -4.8037285804748535, "eval_rewards/margins": 2.1117615699768066, "eval_rewards/rejected": -6.915489673614502, "eval_runtime": 1082.1989, "eval_samples_per_second": 1.848, "eval_steps_per_second": 1.848, "step": 10200 }, { "epoch": 0.67, "grad_norm": 3.6875, "learning_rate": 1.4987623030907955e-06, "logits/chosen": -1.0742769241333008, "logits/rejected": -1.5026159286499023, "logps/chosen": -784.8377075195312, "logps/rejected": -934.9825439453125, "loss": 0.8085, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -5.129917144775391, "rewards/margins": 1.7972902059555054, "rewards/rejected": -6.927206993103027, "step": 10210 }, { "epoch": 0.67, "grad_norm": 37.25, "learning_rate": 1.4935325545029916e-06, "logits/chosen": -1.2842559814453125, "logits/rejected": -0.8180631399154663, "logps/chosen": -775.5546264648438, "logps/rejected": -1051.7867431640625, "loss": 0.4874, "rewards/accuracies": 0.75, "rewards/chosen": -4.635405540466309, "rewards/margins": 2.898428440093994, "rewards/rejected": -7.5338335037231445, "step": 10220 }, { "epoch": 0.67, "grad_norm": 37.75, "learning_rate": 1.4883080569058818e-06, "logits/chosen": -1.5767090320587158, "logits/rejected": -1.1484239101409912, "logps/chosen": -777.5963134765625, "logps/rejected": -1007.2506103515625, "loss": 0.5621, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.840424537658691, "rewards/margins": 2.1486549377441406, "rewards/rejected": -6.989078521728516, "step": 10230 }, { "epoch": 0.67, "grad_norm": 41.25, "learning_rate": 1.4830888375569675e-06, "logits/chosen": -1.4342122077941895, "logits/rejected": -0.706005871295929, "logps/chosen": -710.0197143554688, "logps/rejected": -885.4022216796875, "loss": 0.6506, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.700556755065918, "rewards/margins": 2.388983964920044, "rewards/rejected": -7.089540958404541, "step": 10240 }, { "epoch": 0.67, "grad_norm": 20.875, "learning_rate": 1.4778749236862134e-06, "logits/chosen": -1.479411005973816, "logits/rejected": -0.8337879180908203, "logps/chosen": -859.5596923828125, "logps/rejected": -977.1931762695312, "loss": 0.5361, "rewards/accuracies": 0.75, "rewards/chosen": -5.265883445739746, "rewards/margins": 1.8133106231689453, "rewards/rejected": -7.079194068908691, "step": 10250 }, { "epoch": 0.67, "grad_norm": 69.5, "learning_rate": 1.472666342495903e-06, "logits/chosen": -1.250914216041565, "logits/rejected": -0.698544442653656, "logps/chosen": -684.0372314453125, "logps/rejected": -851.42529296875, "loss": 0.8256, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.657368183135986, "rewards/margins": 1.7373440265655518, "rewards/rejected": -6.394711971282959, "step": 10260 }, { "epoch": 0.67, "grad_norm": 28.625, "learning_rate": 1.4674631211604978e-06, "logits/chosen": -1.4498988389968872, "logits/rejected": -0.037291597574949265, "logps/chosen": -743.4136962890625, "logps/rejected": -819.1989135742188, "loss": 0.6323, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.967642784118652, "rewards/margins": 1.5931787490844727, "rewards/rejected": -6.560821533203125, "step": 10270 }, { "epoch": 0.67, "grad_norm": 17.0, "learning_rate": 1.4622652868264965e-06, "logits/chosen": -1.2123650312423706, "logits/rejected": -1.161731481552124, "logps/chosen": -716.0472412109375, "logps/rejected": -916.1256713867188, "loss": 0.8868, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.776013374328613, "rewards/margins": 1.9316222667694092, "rewards/rejected": -6.707635402679443, "step": 10280 }, { "epoch": 0.67, "grad_norm": 47.25, "learning_rate": 1.4570728666122924e-06, "logits/chosen": -1.532679796218872, "logits/rejected": -0.904069721698761, "logps/chosen": -752.6238403320312, "logps/rejected": -906.2546997070312, "loss": 0.4962, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.429513931274414, "rewards/margins": 2.136923313140869, "rewards/rejected": -6.566436767578125, "step": 10290 }, { "epoch": 0.67, "grad_norm": 6.125, "learning_rate": 1.4518858876080328e-06, "logits/chosen": -1.7310794591903687, "logits/rejected": -1.2071670293807983, "logps/chosen": -704.26611328125, "logps/rejected": -914.1529541015625, "loss": 0.3868, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.3411407470703125, "rewards/margins": 2.1002402305603027, "rewards/rejected": -6.441380977630615, "step": 10300 }, { "epoch": 0.67, "eval_logits/chosen": -1.4423774480819702, "eval_logits/rejected": -1.064092993736267, "eval_logps/chosen": -681.6423950195312, "eval_logps/rejected": -849.95849609375, "eval_loss": 0.5256307721138, "eval_rewards/accuracies": 0.7484999895095825, "eval_rewards/chosen": -4.170227527618408, "eval_rewards/margins": 1.8836411237716675, "eval_rewards/rejected": -6.053868770599365, "eval_runtime": 1082.2366, "eval_samples_per_second": 1.848, "eval_steps_per_second": 1.848, "step": 10300 }, { "epoch": 0.67, "grad_norm": 40.5, "learning_rate": 1.4467043768754732e-06, "logits/chosen": -1.4783813953399658, "logits/rejected": -0.8723516464233398, "logps/chosen": -632.9873046875, "logps/rejected": -734.494873046875, "loss": 0.5848, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.1570844650268555, "rewards/margins": 1.5259284973144531, "rewards/rejected": -5.683012962341309, "step": 10310 }, { "epoch": 0.68, "grad_norm": 46.25, "learning_rate": 1.4415283614478436e-06, "logits/chosen": -1.6210472583770752, "logits/rejected": -1.0158146619796753, "logps/chosen": -753.3812255859375, "logps/rejected": -783.76123046875, "loss": 0.6333, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.159543991088867, "rewards/margins": 1.3206011056900024, "rewards/rejected": -5.48014497756958, "step": 10320 }, { "epoch": 0.68, "grad_norm": 13.0625, "learning_rate": 1.4363578683297035e-06, "logits/chosen": -1.2895166873931885, "logits/rejected": -1.1559741497039795, "logps/chosen": -586.9996948242188, "logps/rejected": -721.89404296875, "loss": 0.6977, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.675889253616333, "rewards/margins": 1.735560655593872, "rewards/rejected": -5.411450386047363, "step": 10330 }, { "epoch": 0.68, "grad_norm": 19.5, "learning_rate": 1.431192924496798e-06, "logits/chosen": -1.1851202249526978, "logits/rejected": -1.3362702131271362, "logps/chosen": -595.7273559570312, "logps/rejected": -783.95166015625, "loss": 0.5527, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.5822594165802, "rewards/margins": 1.7363201379776, "rewards/rejected": -5.31857967376709, "step": 10340 }, { "epoch": 0.68, "grad_norm": 24.375, "learning_rate": 1.4260335568959228e-06, "logits/chosen": -1.3567262887954712, "logits/rejected": -0.5422624945640564, "logps/chosen": -651.248291015625, "logps/rejected": -795.8531494140625, "loss": 0.3788, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.654700517654419, "rewards/margins": 2.1774659156799316, "rewards/rejected": -5.8321661949157715, "step": 10350 }, { "epoch": 0.68, "grad_norm": 3.65625, "learning_rate": 1.4208797924447814e-06, "logits/chosen": -1.449052333831787, "logits/rejected": -1.1186730861663818, "logps/chosen": -583.6400756835938, "logps/rejected": -784.7369995117188, "loss": 0.4041, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.4223074913024902, "rewards/margins": 2.250852584838867, "rewards/rejected": -5.673160552978516, "step": 10360 }, { "epoch": 0.68, "grad_norm": 51.25, "learning_rate": 1.4157316580318414e-06, "logits/chosen": -1.2206075191497803, "logits/rejected": -1.4851930141448975, "logps/chosen": -587.5849609375, "logps/rejected": -774.3282470703125, "loss": 0.4676, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.817155361175537, "rewards/margins": 1.734375238418579, "rewards/rejected": -5.5515313148498535, "step": 10370 }, { "epoch": 0.68, "grad_norm": 30.375, "learning_rate": 1.4105891805162003e-06, "logits/chosen": -1.5946458578109741, "logits/rejected": -0.7892603278160095, "logps/chosen": -673.9332275390625, "logps/rejected": -838.1688232421875, "loss": 0.4946, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.209865570068359, "rewards/margins": 1.923911690711975, "rewards/rejected": -6.133776664733887, "step": 10380 }, { "epoch": 0.68, "grad_norm": 13.75, "learning_rate": 1.40545238672744e-06, "logits/chosen": -1.582629919052124, "logits/rejected": -0.8954054117202759, "logps/chosen": -689.3759155273438, "logps/rejected": -840.8883056640625, "loss": 0.6056, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.158458709716797, "rewards/margins": 1.5511566400527954, "rewards/rejected": -5.709615230560303, "step": 10390 }, { "epoch": 0.68, "grad_norm": 14.125, "learning_rate": 1.4003213034654906e-06, "logits/chosen": -1.4868202209472656, "logits/rejected": -1.3521538972854614, "logps/chosen": -666.3098754882812, "logps/rejected": -823.4846801757812, "loss": 0.6851, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.327915191650391, "rewards/margins": 1.5320122241973877, "rewards/rejected": -5.859927177429199, "step": 10400 }, { "epoch": 0.68, "eval_logits/chosen": -1.4800106287002563, "eval_logits/rejected": -1.1069092750549316, "eval_logps/chosen": -671.8285522460938, "eval_logps/rejected": -836.0790405273438, "eval_loss": 0.5217891931533813, "eval_rewards/accuracies": 0.7480000257492065, "eval_rewards/chosen": -4.072089195251465, "eval_rewards/margins": 1.8429834842681885, "eval_rewards/rejected": -5.915073394775391, "eval_runtime": 1082.2778, "eval_samples_per_second": 1.848, "eval_steps_per_second": 1.848, "step": 10400 }, { "epoch": 0.68, "grad_norm": 14.6875, "learning_rate": 1.395195957500488e-06, "logits/chosen": -1.218768835067749, "logits/rejected": -1.601723074913025, "logps/chosen": -614.4114990234375, "logps/rejected": -828.421875, "loss": 0.3955, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.8022053241729736, "rewards/margins": 2.050584316253662, "rewards/rejected": -5.852789402008057, "step": 10410 }, { "epoch": 0.68, "grad_norm": 33.25, "learning_rate": 1.390076375572636e-06, "logits/chosen": -1.5681931972503662, "logits/rejected": -1.5263299942016602, "logps/chosen": -640.3341674804688, "logps/rejected": -817.5781860351562, "loss": 0.6319, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.091761589050293, "rewards/margins": 1.654823899269104, "rewards/rejected": -5.746584892272949, "step": 10420 }, { "epoch": 0.68, "grad_norm": 17.25, "learning_rate": 1.3849625843920633e-06, "logits/chosen": -1.3671653270721436, "logits/rejected": -1.5939397811889648, "logps/chosen": -613.932861328125, "logps/rejected": -790.2047729492188, "loss": 0.7163, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.049537181854248, "rewards/margins": 1.5071055889129639, "rewards/rejected": -5.556643009185791, "step": 10430 }, { "epoch": 0.68, "grad_norm": 64.5, "learning_rate": 1.3798546106386906e-06, "logits/chosen": -1.612593650817871, "logits/rejected": -1.2519499063491821, "logps/chosen": -614.6940307617188, "logps/rejected": -789.6730346679688, "loss": 0.4157, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.5878453254699707, "rewards/margins": 2.089754104614258, "rewards/rejected": -5.677599906921387, "step": 10440 }, { "epoch": 0.68, "grad_norm": 27.125, "learning_rate": 1.3747524809620852e-06, "logits/chosen": -1.3727446794509888, "logits/rejected": -1.1501520872116089, "logps/chosen": -626.4266357421875, "logps/rejected": -786.50634765625, "loss": 0.5497, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.6604182720184326, "rewards/margins": 1.4629747867584229, "rewards/rejected": -5.1233930587768555, "step": 10450 }, { "epoch": 0.68, "grad_norm": 19.125, "learning_rate": 1.3696562219813263e-06, "logits/chosen": -1.6653640270233154, "logits/rejected": -0.900460422039032, "logps/chosen": -682.3475341796875, "logps/rejected": -852.4351806640625, "loss": 0.5146, "rewards/accuracies": 0.625, "rewards/chosen": -4.114926338195801, "rewards/margins": 1.6670290231704712, "rewards/rejected": -5.781955718994141, "step": 10460 }, { "epoch": 0.69, "grad_norm": 9.625, "learning_rate": 1.364565860284861e-06, "logits/chosen": -1.534045934677124, "logits/rejected": -1.2816047668457031, "logps/chosen": -611.478759765625, "logps/rejected": -808.5330810546875, "loss": 0.351, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.5042824745178223, "rewards/margins": 2.3347010612487793, "rewards/rejected": -5.838983535766602, "step": 10470 }, { "epoch": 0.69, "grad_norm": 27.625, "learning_rate": 1.3594814224303724e-06, "logits/chosen": -1.6841466426849365, "logits/rejected": -1.5560023784637451, "logps/chosen": -606.03515625, "logps/rejected": -803.71435546875, "loss": 0.5314, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.4862639904022217, "rewards/margins": 1.7429144382476807, "rewards/rejected": -5.229178428649902, "step": 10480 }, { "epoch": 0.69, "grad_norm": 19.625, "learning_rate": 1.354402934944636e-06, "logits/chosen": -1.338861346244812, "logits/rejected": -1.206038236618042, "logps/chosen": -723.1608276367188, "logps/rejected": -859.28857421875, "loss": 0.6939, "rewards/accuracies": 0.75, "rewards/chosen": -4.489975452423096, "rewards/margins": 1.2781856060028076, "rewards/rejected": -5.768161296844482, "step": 10490 }, { "epoch": 0.69, "grad_norm": 13.75, "learning_rate": 1.3493304243233802e-06, "logits/chosen": -1.2385965585708618, "logits/rejected": -1.0162632465362549, "logps/chosen": -697.4874267578125, "logps/rejected": -827.4822998046875, "loss": 0.619, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.165760040283203, "rewards/margins": 1.832014799118042, "rewards/rejected": -5.997774600982666, "step": 10500 }, { "epoch": 0.69, "eval_logits/chosen": -1.501827359199524, "eval_logits/rejected": -1.125015139579773, "eval_logps/chosen": -660.5464477539062, "eval_logps/rejected": -822.1693725585938, "eval_loss": 0.5218930840492249, "eval_rewards/accuracies": 0.7475000023841858, "eval_rewards/chosen": -3.959268569946289, "eval_rewards/margins": 1.8167084455490112, "eval_rewards/rejected": -5.77597713470459, "eval_runtime": 1082.1793, "eval_samples_per_second": 1.848, "eval_steps_per_second": 1.848, "step": 10500 }, { "epoch": 0.69, "grad_norm": 10.375, "learning_rate": 1.344263917031156e-06, "logits/chosen": -1.4388107061386108, "logits/rejected": -1.028540849685669, "logps/chosen": -705.0488891601562, "logps/rejected": -833.9869995117188, "loss": 0.5032, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.708178758621216, "rewards/margins": 1.6328964233398438, "rewards/rejected": -5.341075420379639, "step": 10510 }, { "epoch": 0.69, "grad_norm": 20.375, "learning_rate": 1.3392034395011906e-06, "logits/chosen": -1.4455511569976807, "logits/rejected": -0.5793593525886536, "logps/chosen": -720.4301147460938, "logps/rejected": -946.7024536132812, "loss": 0.6909, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.392982482910156, "rewards/margins": 1.8860929012298584, "rewards/rejected": -6.2790751457214355, "step": 10520 }, { "epoch": 0.69, "grad_norm": 6.3125, "learning_rate": 1.3341490181352502e-06, "logits/chosen": -1.2073533535003662, "logits/rejected": -1.0873521566390991, "logps/chosen": -643.8741455078125, "logps/rejected": -745.1998901367188, "loss": 0.6918, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -3.9829812049865723, "rewards/margins": 1.419801115989685, "rewards/rejected": -5.402783393859863, "step": 10530 }, { "epoch": 0.69, "grad_norm": 59.25, "learning_rate": 1.3291006793035075e-06, "logits/chosen": -1.340124487876892, "logits/rejected": -1.0445994138717651, "logps/chosen": -666.6505126953125, "logps/rejected": -809.2142333984375, "loss": 0.6195, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.7942779064178467, "rewards/margins": 1.489195466041565, "rewards/rejected": -5.283473491668701, "step": 10540 }, { "epoch": 0.69, "grad_norm": 48.5, "learning_rate": 1.3240584493444018e-06, "logits/chosen": -1.6340293884277344, "logits/rejected": -1.2340087890625, "logps/chosen": -743.8914794921875, "logps/rejected": -803.8734130859375, "loss": 0.6104, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.301347255706787, "rewards/margins": 1.1212438344955444, "rewards/rejected": -5.422590732574463, "step": 10550 }, { "epoch": 0.69, "grad_norm": 11.1875, "learning_rate": 1.319022354564497e-06, "logits/chosen": -1.1100943088531494, "logits/rejected": -1.642254114151001, "logps/chosen": -604.4169921875, "logps/rejected": -871.1263427734375, "loss": 0.4342, "rewards/accuracies": 0.75, "rewards/chosen": -3.9603874683380127, "rewards/margins": 2.026625633239746, "rewards/rejected": -5.987013339996338, "step": 10560 }, { "epoch": 0.69, "grad_norm": 68.5, "learning_rate": 1.3139924212383525e-06, "logits/chosen": -1.7579749822616577, "logits/rejected": -1.1095645427703857, "logps/chosen": -732.7344360351562, "logps/rejected": -800.1393432617188, "loss": 0.5293, "rewards/accuracies": 0.75, "rewards/chosen": -3.811056613922119, "rewards/margins": 1.96062433719635, "rewards/rejected": -5.771681308746338, "step": 10570 }, { "epoch": 0.69, "grad_norm": 5.8125, "learning_rate": 1.30896867560838e-06, "logits/chosen": -1.0329506397247314, "logits/rejected": -1.313632607460022, "logps/chosen": -579.6593017578125, "logps/rejected": -731.5431518554688, "loss": 0.6399, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -3.7588798999786377, "rewards/margins": 1.3775393962860107, "rewards/rejected": -5.136419773101807, "step": 10580 }, { "epoch": 0.69, "grad_norm": 37.0, "learning_rate": 1.303951143884707e-06, "logits/chosen": -1.530464768409729, "logits/rejected": -1.249775767326355, "logps/chosen": -676.1886596679688, "logps/rejected": -833.80322265625, "loss": 0.7124, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.8721930980682373, "rewards/margins": 1.6735633611679077, "rewards/rejected": -5.5457563400268555, "step": 10590 }, { "epoch": 0.69, "grad_norm": 46.75, "learning_rate": 1.2989398522450436e-06, "logits/chosen": -1.2165011167526245, "logits/rejected": -1.2555465698242188, "logps/chosen": -563.5174560546875, "logps/rejected": -723.0122680664062, "loss": 0.6235, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.6923885345458984, "rewards/margins": 1.6297041177749634, "rewards/rejected": -5.3220930099487305, "step": 10600 }, { "epoch": 0.69, "eval_logits/chosen": -1.5597580671310425, "eval_logits/rejected": -1.2033050060272217, "eval_logps/chosen": -633.8964233398438, "eval_logps/rejected": -785.8031616210938, "eval_loss": 0.5139493346214294, "eval_rewards/accuracies": 0.7459999918937683, "eval_rewards/chosen": -3.6927685737609863, "eval_rewards/margins": 1.7195465564727783, "eval_rewards/rejected": -5.412314414978027, "eval_runtime": 1082.1511, "eval_samples_per_second": 1.848, "eval_steps_per_second": 1.848, "step": 10600 }, { "epoch": 0.69, "grad_norm": 15.4375, "learning_rate": 1.2939348268345436e-06, "logits/chosen": -1.067735195159912, "logits/rejected": -0.4283299446105957, "logps/chosen": -616.9149169921875, "logps/rejected": -788.9300537109375, "loss": 0.3782, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.741664171218872, "rewards/margins": 2.500152111053467, "rewards/rejected": -6.241816520690918, "step": 10610 }, { "epoch": 0.69, "grad_norm": 23.125, "learning_rate": 1.288936093765668e-06, "logits/chosen": -1.3233901262283325, "logits/rejected": -0.9915789365768433, "logps/chosen": -651.1336059570312, "logps/rejected": -823.0012817382812, "loss": 0.7163, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.8418705463409424, "rewards/margins": 1.793863296508789, "rewards/rejected": -5.6357340812683105, "step": 10620 }, { "epoch": 0.7, "grad_norm": 11.0625, "learning_rate": 1.2839436791180487e-06, "logits/chosen": -1.3945049047470093, "logits/rejected": -1.6207764148712158, "logps/chosen": -642.7286987304688, "logps/rejected": -690.32958984375, "loss": 0.7785, "rewards/accuracies": 0.625, "rewards/chosen": -3.6383004188537598, "rewards/margins": 0.8154259920120239, "rewards/rejected": -4.453726768493652, "step": 10630 }, { "epoch": 0.7, "grad_norm": 94.5, "learning_rate": 1.2789576089383542e-06, "logits/chosen": -1.6258407831192017, "logits/rejected": -1.2548949718475342, "logps/chosen": -660.3040771484375, "logps/rejected": -789.8626708984375, "loss": 0.4794, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.8556904792785645, "rewards/margins": 1.683984398841858, "rewards/rejected": -5.539674758911133, "step": 10640 }, { "epoch": 0.7, "grad_norm": 38.5, "learning_rate": 1.27397790924015e-06, "logits/chosen": -1.568908452987671, "logits/rejected": -1.3907241821289062, "logps/chosen": -691.8917846679688, "logps/rejected": -764.9741821289062, "loss": 0.497, "rewards/accuracies": 0.75, "rewards/chosen": -3.7092201709747314, "rewards/margins": 1.3613128662109375, "rewards/rejected": -5.070533275604248, "step": 10650 }, { "epoch": 0.7, "grad_norm": 11.5, "learning_rate": 1.2690046060037661e-06, "logits/chosen": -1.4752072095870972, "logits/rejected": -1.213343620300293, "logps/chosen": -619.9736938476562, "logps/rejected": -749.9158935546875, "loss": 0.6049, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.7960498332977295, "rewards/margins": 1.249610185623169, "rewards/rejected": -5.04565954208374, "step": 10660 }, { "epoch": 0.7, "grad_norm": 16.625, "learning_rate": 1.2640377251761632e-06, "logits/chosen": -1.6135237216949463, "logits/rejected": -1.1512764692306519, "logps/chosen": -642.0647583007812, "logps/rejected": -737.703125, "loss": 0.5079, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.570931911468506, "rewards/margins": 1.403157353401184, "rewards/rejected": -4.974089622497559, "step": 10670 }, { "epoch": 0.7, "grad_norm": 36.75, "learning_rate": 1.2590772926707897e-06, "logits/chosen": -1.4328186511993408, "logits/rejected": -0.9118412137031555, "logps/chosen": -628.4808349609375, "logps/rejected": -870.8009033203125, "loss": 0.3965, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.671886920928955, "rewards/margins": 2.6431522369384766, "rewards/rejected": -6.31503963470459, "step": 10680 }, { "epoch": 0.7, "grad_norm": 12.5, "learning_rate": 1.2541233343674557e-06, "logits/chosen": -1.4377892017364502, "logits/rejected": -1.4795506000518799, "logps/chosen": -632.162109375, "logps/rejected": -811.23583984375, "loss": 0.4193, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.7411530017852783, "rewards/margins": 1.7058792114257812, "rewards/rejected": -5.4470319747924805, "step": 10690 }, { "epoch": 0.7, "grad_norm": 13.6875, "learning_rate": 1.2491758761121925e-06, "logits/chosen": -1.5750110149383545, "logits/rejected": -1.1230591535568237, "logps/chosen": -630.9482421875, "logps/rejected": -869.01904296875, "loss": 0.3952, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.911965847015381, "rewards/margins": 2.5487546920776367, "rewards/rejected": -6.460721015930176, "step": 10700 }, { "epoch": 0.7, "eval_logits/chosen": -1.5122100114822388, "eval_logits/rejected": -1.1462620496749878, "eval_logps/chosen": -660.5130615234375, "eval_logps/rejected": -815.05517578125, "eval_loss": 0.5147351622581482, "eval_rewards/accuracies": 0.7524999976158142, "eval_rewards/chosen": -3.9589345455169678, "eval_rewards/margins": 1.7458993196487427, "eval_rewards/rejected": -5.704833507537842, "eval_runtime": 1082.3111, "eval_samples_per_second": 1.848, "eval_steps_per_second": 1.848, "step": 10700 }, { "epoch": 0.7, "grad_norm": 12.9375, "learning_rate": 1.2442349437171194e-06, "logits/chosen": -1.4359997510910034, "logits/rejected": -1.066007375717163, "logps/chosen": -590.2700805664062, "logps/rejected": -779.9317626953125, "loss": 0.4838, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.8524861335754395, "rewards/margins": 1.9884151220321655, "rewards/rejected": -5.8409013748168945, "step": 10710 }, { "epoch": 0.7, "grad_norm": 17.0, "learning_rate": 1.2393005629603052e-06, "logits/chosen": -1.4606658220291138, "logits/rejected": -0.7708011269569397, "logps/chosen": -699.1524658203125, "logps/rejected": -821.6818237304688, "loss": 0.517, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.24437952041626, "rewards/margins": 1.7233489751815796, "rewards/rejected": -5.967728137969971, "step": 10720 }, { "epoch": 0.7, "grad_norm": 75.5, "learning_rate": 1.2343727595856447e-06, "logits/chosen": -1.2511651515960693, "logits/rejected": -1.017780065536499, "logps/chosen": -630.4780883789062, "logps/rejected": -811.3390502929688, "loss": 0.5236, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.128957271575928, "rewards/margins": 1.7441476583480835, "rewards/rejected": -5.873105049133301, "step": 10730 }, { "epoch": 0.7, "grad_norm": 17.0, "learning_rate": 1.2294515593027088e-06, "logits/chosen": -1.1309648752212524, "logits/rejected": -0.8622885942459106, "logps/chosen": -623.2745971679688, "logps/rejected": -788.3549194335938, "loss": 0.585, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.202927589416504, "rewards/margins": 1.444232702255249, "rewards/rejected": -5.647160530090332, "step": 10740 }, { "epoch": 0.7, "grad_norm": 28.0, "learning_rate": 1.2245369877866229e-06, "logits/chosen": -1.5482370853424072, "logits/rejected": -1.127922773361206, "logps/chosen": -666.7379760742188, "logps/rejected": -776.2698974609375, "loss": 0.6505, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.941754102706909, "rewards/margins": 1.2070884704589844, "rewards/rejected": -5.148842811584473, "step": 10750 }, { "epoch": 0.7, "grad_norm": 69.0, "learning_rate": 1.2196290706779287e-06, "logits/chosen": -1.0613011121749878, "logits/rejected": -0.9187415838241577, "logps/chosen": -737.3240966796875, "logps/rejected": -873.4812622070312, "loss": 0.4422, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.767219543457031, "rewards/margins": 1.7604787349700928, "rewards/rejected": -6.527698516845703, "step": 10760 }, { "epoch": 0.7, "grad_norm": 17.875, "learning_rate": 1.214727833582447e-06, "logits/chosen": -1.3208668231964111, "logits/rejected": -1.133458137512207, "logps/chosen": -625.22998046875, "logps/rejected": -808.7977294921875, "loss": 0.4758, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.021389961242676, "rewards/margins": 1.4423021078109741, "rewards/rejected": -5.463692665100098, "step": 10770 }, { "epoch": 0.71, "grad_norm": 23.0, "learning_rate": 1.20983330207115e-06, "logits/chosen": -0.9682712554931641, "logits/rejected": -1.2549220323562622, "logps/chosen": -611.6253051757812, "logps/rejected": -775.5317993164062, "loss": 0.3845, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.8759970664978027, "rewards/margins": 1.988223671913147, "rewards/rejected": -5.86422061920166, "step": 10780 }, { "epoch": 0.71, "grad_norm": 92.0, "learning_rate": 1.2049455016800248e-06, "logits/chosen": -1.4365875720977783, "logits/rejected": -1.2553602457046509, "logps/chosen": -592.73876953125, "logps/rejected": -792.6962890625, "loss": 0.4752, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.607112169265747, "rewards/margins": 2.1171605587005615, "rewards/rejected": -5.724272727966309, "step": 10790 }, { "epoch": 0.71, "grad_norm": 32.25, "learning_rate": 1.2000644579099415e-06, "logits/chosen": -1.3111979961395264, "logits/rejected": -1.166677713394165, "logps/chosen": -558.2572021484375, "logps/rejected": -734.44384765625, "loss": 0.4521, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.7623653411865234, "rewards/margins": 1.8486125469207764, "rewards/rejected": -5.610977649688721, "step": 10800 }, { "epoch": 0.71, "eval_logits/chosen": -1.4513952732086182, "eval_logits/rejected": -1.076482892036438, "eval_logps/chosen": -693.2052001953125, "eval_logps/rejected": -855.6591186523438, "eval_loss": 0.5215042233467102, "eval_rewards/accuracies": 0.7490000128746033, "eval_rewards/chosen": -4.285856246948242, "eval_rewards/margins": 1.8250185251235962, "eval_rewards/rejected": -6.110874652862549, "eval_runtime": 1082.1803, "eval_samples_per_second": 1.848, "eval_steps_per_second": 1.848, "step": 10800 }, { "epoch": 0.71, "grad_norm": 25.75, "learning_rate": 1.1951901962265155e-06, "logits/chosen": -1.501696228981018, "logits/rejected": -0.9908178448677063, "logps/chosen": -709.2275390625, "logps/rejected": -933.8816528320312, "loss": 0.3624, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.2261505126953125, "rewards/margins": 2.511664628982544, "rewards/rejected": -6.737815856933594, "step": 10810 }, { "epoch": 0.71, "grad_norm": 12.125, "learning_rate": 1.1903227420599824e-06, "logits/chosen": -1.9677479267120361, "logits/rejected": -0.2535783648490906, "logps/chosen": -766.1469116210938, "logps/rejected": -893.7091064453125, "loss": 0.3744, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.5134429931640625, "rewards/margins": 2.2386910915374756, "rewards/rejected": -6.752133846282959, "step": 10820 }, { "epoch": 0.71, "grad_norm": 11.25, "learning_rate": 1.1854621208050596e-06, "logits/chosen": -1.5689867734909058, "logits/rejected": -1.3838443756103516, "logps/chosen": -766.6506958007812, "logps/rejected": -986.9143676757812, "loss": 0.5179, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.5301923751831055, "rewards/margins": 2.312378168106079, "rewards/rejected": -6.8425703048706055, "step": 10830 }, { "epoch": 0.71, "grad_norm": 73.5, "learning_rate": 1.1806083578208153e-06, "logits/chosen": -1.382569670677185, "logits/rejected": -0.8673487901687622, "logps/chosen": -698.1746826171875, "logps/rejected": -871.6192626953125, "loss": 0.6535, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.4911603927612305, "rewards/margins": 1.9680321216583252, "rewards/rejected": -6.459192752838135, "step": 10840 }, { "epoch": 0.71, "grad_norm": 8.0, "learning_rate": 1.1757614784305372e-06, "logits/chosen": -1.3576117753982544, "logits/rejected": -0.9826158285140991, "logps/chosen": -694.575439453125, "logps/rejected": -907.1066284179688, "loss": 0.5816, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.2628679275512695, "rewards/margins": 2.1125435829162598, "rewards/rejected": -6.3754119873046875, "step": 10850 }, { "epoch": 0.71, "grad_norm": 24.375, "learning_rate": 1.1709215079215999e-06, "logits/chosen": -1.047668695449829, "logits/rejected": -1.2260911464691162, "logps/chosen": -719.9637451171875, "logps/rejected": -818.7568359375, "loss": 0.8756, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.493109703063965, "rewards/margins": 1.1381580829620361, "rewards/rejected": -5.631267547607422, "step": 10860 }, { "epoch": 0.71, "grad_norm": 49.0, "learning_rate": 1.1660884715453301e-06, "logits/chosen": -1.559966802597046, "logits/rejected": -0.8893915414810181, "logps/chosen": -709.776611328125, "logps/rejected": -908.28564453125, "loss": 0.5262, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.422757625579834, "rewards/margins": 2.0521726608276367, "rewards/rejected": -6.474930763244629, "step": 10870 }, { "epoch": 0.71, "grad_norm": 16.5, "learning_rate": 1.1612623945168804e-06, "logits/chosen": -1.2266969680786133, "logits/rejected": -1.0579445362091064, "logps/chosen": -653.5391845703125, "logps/rejected": -842.2947998046875, "loss": 0.6602, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.734743595123291, "rewards/margins": 1.4458787441253662, "rewards/rejected": -6.180622577667236, "step": 10880 }, { "epoch": 0.71, "grad_norm": 22.0, "learning_rate": 1.1564433020150946e-06, "logits/chosen": -1.3439654111862183, "logits/rejected": -1.3570594787597656, "logps/chosen": -763.9058837890625, "logps/rejected": -902.9019775390625, "loss": 0.6042, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.61073112487793, "rewards/margins": 1.4340050220489502, "rewards/rejected": -6.044736385345459, "step": 10890 }, { "epoch": 0.71, "grad_norm": 89.5, "learning_rate": 1.1516312191823725e-06, "logits/chosen": -1.1360535621643066, "logits/rejected": -0.8433725237846375, "logps/chosen": -734.4967041015625, "logps/rejected": -884.2633666992188, "loss": 0.7094, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.9538750648498535, "rewards/margins": 1.4453742504119873, "rewards/rejected": -6.399249076843262, "step": 10900 }, { "epoch": 0.71, "eval_logits/chosen": -1.4484448432922363, "eval_logits/rejected": -1.0677849054336548, "eval_logps/chosen": -688.0204467773438, "eval_logps/rejected": -848.9450073242188, "eval_loss": 0.5195026397705078, "eval_rewards/accuracies": 0.7494999766349792, "eval_rewards/chosen": -4.234007835388184, "eval_rewards/margins": 1.8097251653671265, "eval_rewards/rejected": -6.0437331199646, "eval_runtime": 1082.1, "eval_samples_per_second": 1.848, "eval_steps_per_second": 1.848, "step": 10900 }, { "epoch": 0.71, "grad_norm": 27.0, "learning_rate": 1.1468261711245479e-06, "logits/chosen": -1.409387230873108, "logits/rejected": -1.189150094985962, "logps/chosen": -814.6851196289062, "logps/rejected": -1000.2013549804688, "loss": 0.4752, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.727694988250732, "rewards/margins": 2.2806029319763184, "rewards/rejected": -7.008297920227051, "step": 10910 }, { "epoch": 0.71, "grad_norm": 19.875, "learning_rate": 1.1420281829107506e-06, "logits/chosen": -1.7946569919586182, "logits/rejected": -1.4083147048950195, "logps/chosen": -689.6478271484375, "logps/rejected": -849.3262939453125, "loss": 0.5052, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.176295280456543, "rewards/margins": 1.668029546737671, "rewards/rejected": -5.844324111938477, "step": 10920 }, { "epoch": 0.72, "grad_norm": 24.25, "learning_rate": 1.137237279573275e-06, "logits/chosen": -1.464867353439331, "logits/rejected": -0.9896612167358398, "logps/chosen": -711.1861572265625, "logps/rejected": -888.4918823242188, "loss": 0.5445, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.7422308921813965, "rewards/margins": 1.6512006521224976, "rewards/rejected": -6.393432140350342, "step": 10930 }, { "epoch": 0.72, "grad_norm": 35.5, "learning_rate": 1.1324534861074544e-06, "logits/chosen": -1.4844785928726196, "logits/rejected": -1.090965986251831, "logps/chosen": -745.7401123046875, "logps/rejected": -929.5675659179688, "loss": 0.4979, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.4492573738098145, "rewards/margins": 1.7915910482406616, "rewards/rejected": -6.240848064422607, "step": 10940 }, { "epoch": 0.72, "grad_norm": 16.625, "learning_rate": 1.1276768274715302e-06, "logits/chosen": -1.4116075038909912, "logits/rejected": -0.8608868718147278, "logps/chosen": -676.7122802734375, "logps/rejected": -860.9592895507812, "loss": 0.4128, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.969865083694458, "rewards/margins": 2.4479689598083496, "rewards/rejected": -6.4178338050842285, "step": 10950 }, { "epoch": 0.72, "grad_norm": 28.375, "learning_rate": 1.1229073285865146e-06, "logits/chosen": -1.871652364730835, "logits/rejected": -0.5481523275375366, "logps/chosen": -805.5091552734375, "logps/rejected": -871.3497314453125, "loss": 0.6799, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.663176536560059, "rewards/margins": 1.4890005588531494, "rewards/rejected": -6.152177333831787, "step": 10960 }, { "epoch": 0.72, "grad_norm": 125.0, "learning_rate": 1.118145014336069e-06, "logits/chosen": -1.2835125923156738, "logits/rejected": -1.1027185916900635, "logps/chosen": -669.7252807617188, "logps/rejected": -825.8621215820312, "loss": 0.571, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.330521583557129, "rewards/margins": 1.650475263595581, "rewards/rejected": -5.980997085571289, "step": 10970 }, { "epoch": 0.72, "grad_norm": 50.25, "learning_rate": 1.1133899095663706e-06, "logits/chosen": -1.7450100183486938, "logits/rejected": -1.0669329166412354, "logps/chosen": -757.4323120117188, "logps/rejected": -901.1731567382812, "loss": 0.6356, "rewards/accuracies": 0.75, "rewards/chosen": -4.705446243286133, "rewards/margins": 1.7028335332870483, "rewards/rejected": -6.4082794189453125, "step": 10980 }, { "epoch": 0.72, "grad_norm": 54.25, "learning_rate": 1.1086420390859806e-06, "logits/chosen": -1.6085093021392822, "logits/rejected": -1.0353999137878418, "logps/chosen": -731.6707763671875, "logps/rejected": -863.5408935546875, "loss": 0.525, "rewards/accuracies": 0.75, "rewards/chosen": -4.521809101104736, "rewards/margins": 1.7326877117156982, "rewards/rejected": -6.2544965744018555, "step": 10990 }, { "epoch": 0.72, "grad_norm": 52.5, "learning_rate": 1.1039014276657197e-06, "logits/chosen": -1.4804106950759888, "logits/rejected": -0.700351357460022, "logps/chosen": -842.9129638671875, "logps/rejected": -878.9481201171875, "loss": 0.6759, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -5.114873886108398, "rewards/margins": 1.228914737701416, "rewards/rejected": -6.3437886238098145, "step": 11000 }, { "epoch": 0.72, "eval_logits/chosen": -1.4572663307189941, "eval_logits/rejected": -1.0736693143844604, "eval_logps/chosen": -681.5213012695312, "eval_logps/rejected": -842.6663818359375, "eval_loss": 0.5183743238449097, "eval_rewards/accuracies": 0.7484999895095825, "eval_rewards/chosen": -4.1690168380737305, "eval_rewards/margins": 1.811930537223816, "eval_rewards/rejected": -5.980947494506836, "eval_runtime": 1082.9828, "eval_samples_per_second": 1.847, "eval_steps_per_second": 1.847, "step": 11000 }, { "epoch": 0.72, "grad_norm": 44.25, "learning_rate": 1.0991681000385363e-06, "logits/chosen": -1.3604611158370972, "logits/rejected": -0.5477991700172424, "logps/chosen": -668.8902587890625, "logps/rejected": -848.5079345703125, "loss": 0.371, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.88586688041687, "rewards/margins": 2.349989414215088, "rewards/rejected": -6.235856056213379, "step": 11010 }, { "epoch": 0.72, "grad_norm": 25.0, "learning_rate": 1.0944420808993747e-06, "logits/chosen": -1.6504539251327515, "logits/rejected": -0.9524500966072083, "logps/chosen": -644.7841796875, "logps/rejected": -851.1920776367188, "loss": 0.4128, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.6666903495788574, "rewards/margins": 2.0562329292297363, "rewards/rejected": -5.7229228019714355, "step": 11020 }, { "epoch": 0.72, "grad_norm": 26.5, "learning_rate": 1.0897233949050518e-06, "logits/chosen": -1.4464709758758545, "logits/rejected": -0.9995682835578918, "logps/chosen": -668.1272583007812, "logps/rejected": -791.340087890625, "loss": 0.6269, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.209960460662842, "rewards/margins": 1.3581387996673584, "rewards/rejected": -5.568099021911621, "step": 11030 }, { "epoch": 0.72, "grad_norm": 25.375, "learning_rate": 1.0850120666741246e-06, "logits/chosen": -1.7755985260009766, "logits/rejected": -1.2400100231170654, "logps/chosen": -614.2630004882812, "logps/rejected": -721.1644287109375, "loss": 0.6113, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.7826180458068848, "rewards/margins": 1.2631304264068604, "rewards/rejected": -5.045748710632324, "step": 11040 }, { "epoch": 0.72, "grad_norm": 17.375, "learning_rate": 1.0803081207867624e-06, "logits/chosen": -1.6154143810272217, "logits/rejected": -0.8008478879928589, "logps/chosen": -647.5601806640625, "logps/rejected": -799.0430908203125, "loss": 0.5421, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.961895704269409, "rewards/margins": 2.024866819381714, "rewards/rejected": -5.986762046813965, "step": 11050 }, { "epoch": 0.72, "grad_norm": 22.375, "learning_rate": 1.0756115817846203e-06, "logits/chosen": -1.3996070623397827, "logits/rejected": -1.084146499633789, "logps/chosen": -671.1369018554688, "logps/rejected": -896.6243286132812, "loss": 0.5705, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.918748378753662, "rewards/margins": 2.561513900756836, "rewards/rejected": -6.48026180267334, "step": 11060 }, { "epoch": 0.72, "grad_norm": 5.71875, "learning_rate": 1.0709224741707095e-06, "logits/chosen": -1.130397915840149, "logits/rejected": -0.666817307472229, "logps/chosen": -689.0978393554688, "logps/rejected": -871.6126708984375, "loss": 0.5187, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.437009334564209, "rewards/margins": 2.1272177696228027, "rewards/rejected": -6.564227104187012, "step": 11070 }, { "epoch": 0.72, "grad_norm": 24.25, "learning_rate": 1.0662408224092674e-06, "logits/chosen": -1.4376206398010254, "logits/rejected": -1.254671573638916, "logps/chosen": -601.8767700195312, "logps/rejected": -764.027587890625, "loss": 0.3836, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.3066115379333496, "rewards/margins": 2.107612133026123, "rewards/rejected": -5.414223670959473, "step": 11080 }, { "epoch": 0.73, "grad_norm": 16.875, "learning_rate": 1.0615666509256353e-06, "logits/chosen": -0.8650871515274048, "logits/rejected": -0.8224350214004517, "logps/chosen": -696.7799682617188, "logps/rejected": -841.8758544921875, "loss": 0.4576, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.398080825805664, "rewards/margins": 1.7414579391479492, "rewards/rejected": -6.139538764953613, "step": 11090 }, { "epoch": 0.73, "grad_norm": 29.25, "learning_rate": 1.056899984106127e-06, "logits/chosen": -1.048078179359436, "logits/rejected": -0.7443400621414185, "logps/chosen": -638.5119018554688, "logps/rejected": -807.7030029296875, "loss": 0.4752, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.9991562366485596, "rewards/margins": 1.8358609676361084, "rewards/rejected": -5.835017204284668, "step": 11100 }, { "epoch": 0.73, "eval_logits/chosen": -1.5326206684112549, "eval_logits/rejected": -1.1637712717056274, "eval_logps/chosen": -651.98974609375, "eval_logps/rejected": -807.3626708984375, "eval_loss": 0.5153878331184387, "eval_rewards/accuracies": 0.7465000152587891, "eval_rewards/chosen": -3.8737010955810547, "eval_rewards/margins": 1.7542093992233276, "eval_rewards/rejected": -5.627910137176514, "eval_runtime": 1082.9896, "eval_samples_per_second": 1.847, "eval_steps_per_second": 1.847, "step": 11100 }, { "epoch": 0.73, "grad_norm": 11.25, "learning_rate": 1.0522408462979036e-06, "logits/chosen": -1.6922399997711182, "logits/rejected": -1.291743278503418, "logps/chosen": -589.0997314453125, "logps/rejected": -685.0193481445312, "loss": 0.5407, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.6088738441467285, "rewards/margins": 1.228177785873413, "rewards/rejected": -4.837052345275879, "step": 11110 }, { "epoch": 0.73, "grad_norm": 123.0, "learning_rate": 1.0475892618088426e-06, "logits/chosen": -1.3855421543121338, "logits/rejected": -1.1448593139648438, "logps/chosen": -707.102294921875, "logps/rejected": -833.8596801757812, "loss": 0.6073, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.2670392990112305, "rewards/margins": 1.4605811834335327, "rewards/rejected": -5.727620601654053, "step": 11120 }, { "epoch": 0.73, "grad_norm": 31.25, "learning_rate": 1.0429452549074167e-06, "logits/chosen": -1.5981324911117554, "logits/rejected": -1.0032703876495361, "logps/chosen": -658.7590942382812, "logps/rejected": -760.2555541992188, "loss": 0.5279, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.638666868209839, "rewards/margins": 1.5968847274780273, "rewards/rejected": -5.235550880432129, "step": 11130 }, { "epoch": 0.73, "grad_norm": 25.0, "learning_rate": 1.0383088498225646e-06, "logits/chosen": -1.5595321655273438, "logits/rejected": -1.1686902046203613, "logps/chosen": -652.8717041015625, "logps/rejected": -890.9627075195312, "loss": 0.449, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.788221836090088, "rewards/margins": 2.0910606384277344, "rewards/rejected": -5.879281997680664, "step": 11140 }, { "epoch": 0.73, "grad_norm": 7.375, "learning_rate": 1.0336800707435616e-06, "logits/chosen": -1.3866736888885498, "logits/rejected": -1.1973087787628174, "logps/chosen": -714.6318359375, "logps/rejected": -786.62744140625, "loss": 0.5174, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.9722352027893066, "rewards/margins": 1.5295145511627197, "rewards/rejected": -5.5017499923706055, "step": 11150 }, { "epoch": 0.73, "grad_norm": 19.125, "learning_rate": 1.0290589418198987e-06, "logits/chosen": -1.8074315786361694, "logits/rejected": -1.130959391593933, "logps/chosen": -679.8025512695312, "logps/rejected": -873.0748291015625, "loss": 0.3479, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.845219373703003, "rewards/margins": 2.3630409240722656, "rewards/rejected": -6.208260536193848, "step": 11160 }, { "epoch": 0.73, "grad_norm": 37.5, "learning_rate": 1.024445487161154e-06, "logits/chosen": -1.4735791683197021, "logits/rejected": -0.9718071818351746, "logps/chosen": -643.5035400390625, "logps/rejected": -833.1701049804688, "loss": 0.4611, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.056186676025391, "rewards/margins": 2.140188694000244, "rewards/rejected": -6.196375370025635, "step": 11170 }, { "epoch": 0.73, "grad_norm": 33.75, "learning_rate": 1.019839730836866e-06, "logits/chosen": -0.994766354560852, "logits/rejected": -1.3450130224227905, "logps/chosen": -661.3984985351562, "logps/rejected": -821.025390625, "loss": 0.5308, "rewards/accuracies": 0.75, "rewards/chosen": -4.219649314880371, "rewards/margins": 1.4945405721664429, "rewards/rejected": -5.714190483093262, "step": 11180 }, { "epoch": 0.73, "grad_norm": 21.875, "learning_rate": 1.01524169687641e-06, "logits/chosen": -1.3772180080413818, "logits/rejected": -0.6823263168334961, "logps/chosen": -648.3936157226562, "logps/rejected": -826.3736572265625, "loss": 0.5045, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.8270130157470703, "rewards/margins": 2.2313995361328125, "rewards/rejected": -6.058412075042725, "step": 11190 }, { "epoch": 0.73, "grad_norm": 48.25, "learning_rate": 1.0106514092688723e-06, "logits/chosen": -1.4591482877731323, "logits/rejected": -0.9912242889404297, "logps/chosen": -754.6162109375, "logps/rejected": -912.1580200195312, "loss": 0.4382, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.1812334060668945, "rewards/margins": 2.285008192062378, "rewards/rejected": -6.46624231338501, "step": 11200 }, { "epoch": 0.73, "eval_logits/chosen": -1.5242676734924316, "eval_logits/rejected": -1.1532734632492065, "eval_logps/chosen": -664.0819702148438, "eval_logps/rejected": -824.1630859375, "eval_loss": 0.5193309187889099, "eval_rewards/accuracies": 0.75, "eval_rewards/chosen": -3.9946234226226807, "eval_rewards/margins": 1.8012902736663818, "eval_rewards/rejected": -5.795914173126221, "eval_runtime": 1083.5948, "eval_samples_per_second": 1.846, "eval_steps_per_second": 1.846, "step": 11200 }, { "epoch": 0.73, "grad_norm": 15.5, "learning_rate": 1.006068891962922e-06, "logits/chosen": -1.5496050119400024, "logits/rejected": -1.3101630210876465, "logps/chosen": -709.2440185546875, "logps/rejected": -727.5101928710938, "loss": 0.6986, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.106968879699707, "rewards/margins": 1.0762665271759033, "rewards/rejected": -5.183234691619873, "step": 11210 }, { "epoch": 0.73, "grad_norm": 17.5, "learning_rate": 1.0014941688666903e-06, "logits/chosen": -1.7075217962265015, "logits/rejected": -0.9084409475326538, "logps/chosen": -651.8974609375, "logps/rejected": -868.818359375, "loss": 0.685, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.090898036956787, "rewards/margins": 1.9089428186416626, "rewards/rejected": -5.99984073638916, "step": 11220 }, { "epoch": 0.73, "grad_norm": 5.875, "learning_rate": 9.969272638476462e-07, "logits/chosen": -1.6504061222076416, "logits/rejected": -1.2395288944244385, "logps/chosen": -685.8689575195312, "logps/rejected": -821.2249755859375, "loss": 0.4892, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.7263190746307373, "rewards/margins": 1.7704594135284424, "rewards/rejected": -5.496779441833496, "step": 11230 }, { "epoch": 0.74, "grad_norm": 43.75, "learning_rate": 9.923682007324657e-07, "logits/chosen": -1.3138939142227173, "logits/rejected": -1.286132574081421, "logps/chosen": -667.2005615234375, "logps/rejected": -772.8294067382812, "loss": 0.4726, "rewards/accuracies": 0.75, "rewards/chosen": -3.665712833404541, "rewards/margins": 1.6450169086456299, "rewards/rejected": -5.31072998046875, "step": 11240 }, { "epoch": 0.74, "grad_norm": 18.0, "learning_rate": 9.878170033069138e-07, "logits/chosen": -1.0629277229309082, "logits/rejected": -0.9022231101989746, "logps/chosen": -592.915283203125, "logps/rejected": -778.9974365234375, "loss": 0.5697, "rewards/accuracies": 0.75, "rewards/chosen": -3.9785568714141846, "rewards/margins": 1.5103801488876343, "rewards/rejected": -5.4889373779296875, "step": 11250 }, { "epoch": 0.74, "grad_norm": 23.625, "learning_rate": 9.832736953157194e-07, "logits/chosen": -1.6176369190216064, "logits/rejected": -1.4319775104522705, "logps/chosen": -599.7904052734375, "logps/rejected": -867.8541259765625, "loss": 0.5734, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.682180881500244, "rewards/margins": 2.528827667236328, "rewards/rejected": -6.211008548736572, "step": 11260 }, { "epoch": 0.74, "grad_norm": 4.125, "learning_rate": 9.787383004624462e-07, "logits/chosen": -1.6502044200897217, "logits/rejected": -1.1310429573059082, "logps/chosen": -663.2899169921875, "logps/rejected": -772.658447265625, "loss": 0.5567, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.7299742698669434, "rewards/margins": 1.7795203924179077, "rewards/rejected": -5.509494781494141, "step": 11270 }, { "epoch": 0.74, "grad_norm": 30.75, "learning_rate": 9.742108424093787e-07, "logits/chosen": -1.568800449371338, "logits/rejected": -1.5011690855026245, "logps/chosen": -572.98779296875, "logps/rejected": -837.0050659179688, "loss": 0.4131, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.6964657306671143, "rewards/margins": 2.3272838592529297, "rewards/rejected": -6.023749351501465, "step": 11280 }, { "epoch": 0.74, "grad_norm": 20.625, "learning_rate": 9.696913447773907e-07, "logits/chosen": -1.2425655126571655, "logits/rejected": -1.2369608879089355, "logps/chosen": -658.2379760742188, "logps/rejected": -734.7034301757812, "loss": 0.7922, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.262181758880615, "rewards/margins": 1.1327687501907349, "rewards/rejected": -5.394951343536377, "step": 11290 }, { "epoch": 0.74, "grad_norm": 51.5, "learning_rate": 9.651798311458213e-07, "logits/chosen": -1.3925364017486572, "logits/rejected": -1.3539361953735352, "logps/chosen": -622.0708618164062, "logps/rejected": -800.9049072265625, "loss": 0.5666, "rewards/accuracies": 0.75, "rewards/chosen": -3.9556052684783936, "rewards/margins": 1.536245584487915, "rewards/rejected": -5.491851329803467, "step": 11300 }, { "epoch": 0.74, "eval_logits/chosen": -1.5188477039337158, "eval_logits/rejected": -1.1489087343215942, "eval_logps/chosen": -661.8636474609375, "eval_logps/rejected": -821.8570556640625, "eval_loss": 0.5179318189620972, "eval_rewards/accuracies": 0.7509999871253967, "eval_rewards/chosen": -3.972439765930176, "eval_rewards/margins": 1.8004143238067627, "eval_rewards/rejected": -5.772854328155518, "eval_runtime": 1083.0146, "eval_samples_per_second": 1.847, "eval_steps_per_second": 1.847, "step": 11300 }, { "epoch": 0.74, "grad_norm": 11.5, "learning_rate": 9.606763250523595e-07, "logits/chosen": -1.590217113494873, "logits/rejected": -0.2724389433860779, "logps/chosen": -647.0823974609375, "logps/rejected": -763.3814697265625, "loss": 0.4159, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.77226185798645, "rewards/margins": 1.8791347742080688, "rewards/rejected": -5.651396751403809, "step": 11310 }, { "epoch": 0.74, "grad_norm": 45.25, "learning_rate": 9.561808499929163e-07, "logits/chosen": -1.551206350326538, "logits/rejected": -1.228323221206665, "logps/chosen": -657.3067626953125, "logps/rejected": -836.9056396484375, "loss": 0.8319, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -3.890735626220703, "rewards/margins": 1.444014310836792, "rewards/rejected": -5.334750175476074, "step": 11320 }, { "epoch": 0.74, "grad_norm": 10.5625, "learning_rate": 9.516934294214994e-07, "logits/chosen": -1.5126091241836548, "logits/rejected": -1.1565630435943604, "logps/chosen": -644.2545776367188, "logps/rejected": -766.0762939453125, "loss": 0.8224, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -3.9169631004333496, "rewards/margins": 1.47373366355896, "rewards/rejected": -5.390697479248047, "step": 11330 }, { "epoch": 0.74, "grad_norm": 0.12109375, "learning_rate": 9.472140867500984e-07, "logits/chosen": -1.6985282897949219, "logits/rejected": -0.6034991145133972, "logps/chosen": -650.6131591796875, "logps/rejected": -878.9402465820312, "loss": 0.3597, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.6340534687042236, "rewards/margins": 2.892606019973755, "rewards/rejected": -6.526658535003662, "step": 11340 }, { "epoch": 0.74, "grad_norm": 9.25, "learning_rate": 9.427428453485573e-07, "logits/chosen": -1.4451093673706055, "logits/rejected": -1.2360419034957886, "logps/chosen": -662.2075805664062, "logps/rejected": -969.5867309570312, "loss": 0.3296, "rewards/accuracies": 0.875, "rewards/chosen": -3.9523608684539795, "rewards/margins": 2.6647050380706787, "rewards/rejected": -6.617066383361816, "step": 11350 }, { "epoch": 0.74, "grad_norm": 29.625, "learning_rate": 9.382797285444514e-07, "logits/chosen": -1.4039177894592285, "logits/rejected": -1.1079533100128174, "logps/chosen": -632.98583984375, "logps/rejected": -733.6910400390625, "loss": 0.6763, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.709031343460083, "rewards/margins": 1.2059314250946045, "rewards/rejected": -4.914963245391846, "step": 11360 }, { "epoch": 0.74, "grad_norm": 21.625, "learning_rate": 9.33824759622971e-07, "logits/chosen": -1.2493447065353394, "logits/rejected": -1.0052391290664673, "logps/chosen": -547.8887939453125, "logps/rejected": -770.4210815429688, "loss": 0.4788, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.7086639404296875, "rewards/margins": 1.6633679866790771, "rewards/rejected": -5.372032165527344, "step": 11370 }, { "epoch": 0.74, "grad_norm": 23.625, "learning_rate": 9.293779618267957e-07, "logits/chosen": -1.3770257234573364, "logits/rejected": -0.5524855852127075, "logps/chosen": -615.2626342773438, "logps/rejected": -676.6341552734375, "loss": 0.7283, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.901782274246216, "rewards/margins": 1.0626537799835205, "rewards/rejected": -4.9644365310668945, "step": 11380 }, { "epoch": 0.75, "grad_norm": 43.25, "learning_rate": 9.249393583559737e-07, "logits/chosen": -1.4542347192764282, "logits/rejected": -1.1675550937652588, "logps/chosen": -556.9129638671875, "logps/rejected": -738.1154174804688, "loss": 0.602, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.625152587890625, "rewards/margins": 1.6974029541015625, "rewards/rejected": -5.322554588317871, "step": 11390 }, { "epoch": 0.75, "grad_norm": 31.125, "learning_rate": 9.205089723678026e-07, "logits/chosen": -1.420475959777832, "logits/rejected": -1.2236034870147705, "logps/chosen": -593.2994384765625, "logps/rejected": -708.1497802734375, "loss": 0.6254, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.9391849040985107, "rewards/margins": 1.1734837293624878, "rewards/rejected": -5.112668514251709, "step": 11400 }, { "epoch": 0.75, "eval_logits/chosen": -1.5401312112808228, "eval_logits/rejected": -1.177199125289917, "eval_logps/chosen": -651.9423217773438, "eval_logps/rejected": -808.842041015625, "eval_loss": 0.5159971117973328, "eval_rewards/accuracies": 0.7509999871253967, "eval_rewards/chosen": -3.873227834701538, "eval_rewards/margins": 1.769476294517517, "eval_rewards/rejected": -5.642704010009766, "eval_runtime": 1082.2934, "eval_samples_per_second": 1.848, "eval_steps_per_second": 1.848, "step": 11400 }, { "epoch": 0.75, "grad_norm": 44.5, "learning_rate": 9.160868269767073e-07, "logits/chosen": -1.2842638492584229, "logits/rejected": -1.179369330406189, "logps/chosen": -627.15087890625, "logps/rejected": -857.9449462890625, "loss": 0.5905, "rewards/accuracies": 0.75, "rewards/chosen": -3.8428313732147217, "rewards/margins": 2.141211748123169, "rewards/rejected": -5.984044075012207, "step": 11410 }, { "epoch": 0.75, "grad_norm": 29.5, "learning_rate": 9.116729452541171e-07, "logits/chosen": -1.4884748458862305, "logits/rejected": -0.7454864382743835, "logps/chosen": -659.6254272460938, "logps/rejected": -879.5703125, "loss": 0.4792, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.161078929901123, "rewards/margins": 2.3977062702178955, "rewards/rejected": -6.558785438537598, "step": 11420 }, { "epoch": 0.75, "grad_norm": 28.375, "learning_rate": 9.072673502283505e-07, "logits/chosen": -1.7381538152694702, "logits/rejected": -1.2404654026031494, "logps/chosen": -640.6629638671875, "logps/rejected": -766.8131103515625, "loss": 0.4407, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.6156201362609863, "rewards/margins": 1.6657425165176392, "rewards/rejected": -5.281362533569336, "step": 11430 }, { "epoch": 0.75, "grad_norm": 26.25, "learning_rate": 9.028700648844913e-07, "logits/chosen": -1.3848060369491577, "logits/rejected": -1.1221176385879517, "logps/chosen": -631.39501953125, "logps/rejected": -823.2238159179688, "loss": 0.5507, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.6594557762145996, "rewards/margins": 2.135200262069702, "rewards/rejected": -5.794655799865723, "step": 11440 }, { "epoch": 0.75, "grad_norm": 4.96875, "learning_rate": 8.984811121642698e-07, "logits/chosen": -1.3590683937072754, "logits/rejected": -1.3658292293548584, "logps/chosen": -649.47607421875, "logps/rejected": -746.535400390625, "loss": 0.6184, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.800395965576172, "rewards/margins": 1.780416488647461, "rewards/rejected": -5.580812454223633, "step": 11450 }, { "epoch": 0.75, "grad_norm": 41.75, "learning_rate": 8.941005149659418e-07, "logits/chosen": -1.4518911838531494, "logits/rejected": -1.2264163494110107, "logps/chosen": -618.6221313476562, "logps/rejected": -739.60791015625, "loss": 0.5102, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.3561244010925293, "rewards/margins": 1.7630319595336914, "rewards/rejected": -5.119156360626221, "step": 11460 }, { "epoch": 0.75, "grad_norm": 5.28125, "learning_rate": 8.897282961441714e-07, "logits/chosen": -1.8710002899169922, "logits/rejected": -1.453700304031372, "logps/chosen": -696.6652221679688, "logps/rejected": -818.4342041015625, "loss": 0.4676, "rewards/accuracies": 0.75, "rewards/chosen": -3.8944854736328125, "rewards/margins": 1.5582842826843262, "rewards/rejected": -5.452770709991455, "step": 11470 }, { "epoch": 0.75, "grad_norm": 11.9375, "learning_rate": 8.853644785099113e-07, "logits/chosen": -1.419798493385315, "logits/rejected": -0.2011881321668625, "logps/chosen": -636.9826049804688, "logps/rejected": -857.1910400390625, "loss": 0.3923, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.102320671081543, "rewards/margins": 2.471985101699829, "rewards/rejected": -6.574305534362793, "step": 11480 }, { "epoch": 0.75, "grad_norm": 13.8125, "learning_rate": 8.810090848302796e-07, "logits/chosen": -1.6194489002227783, "logits/rejected": -1.4365675449371338, "logps/chosen": -684.3155517578125, "logps/rejected": -854.6485595703125, "loss": 0.4724, "rewards/accuracies": 0.75, "rewards/chosen": -3.711862087249756, "rewards/margins": 2.1244473457336426, "rewards/rejected": -5.836309909820557, "step": 11490 }, { "epoch": 0.75, "grad_norm": 2.59375, "learning_rate": 8.766621378284499e-07, "logits/chosen": -1.7274757623672485, "logits/rejected": -1.1776467561721802, "logps/chosen": -693.0520629882812, "logps/rejected": -823.9361572265625, "loss": 0.5912, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.232132911682129, "rewards/margins": 1.4059382677078247, "rewards/rejected": -5.638071060180664, "step": 11500 }, { "epoch": 0.75, "eval_logits/chosen": -1.5291798114776611, "eval_logits/rejected": -1.1611979007720947, "eval_logps/chosen": -657.7830200195312, "eval_logps/rejected": -816.4194946289062, "eval_loss": 0.517346203327179, "eval_rewards/accuracies": 0.75, "eval_rewards/chosen": -3.9316344261169434, "eval_rewards/margins": 1.7868434190750122, "eval_rewards/rejected": -5.718478202819824, "eval_runtime": 1082.1983, "eval_samples_per_second": 1.848, "eval_steps_per_second": 1.848, "step": 11500 }, { "epoch": 0.75, "grad_norm": 15.0, "learning_rate": 8.723236601835245e-07, "logits/chosen": -1.401452660560608, "logits/rejected": -1.1946027278900146, "logps/chosen": -655.5980224609375, "logps/rejected": -861.2353515625, "loss": 0.2913, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.9774937629699707, "rewards/margins": 2.2400383949279785, "rewards/rejected": -6.217532157897949, "step": 11510 }, { "epoch": 0.75, "grad_norm": 7.90625, "learning_rate": 8.679936745304171e-07, "logits/chosen": -1.4947093725204468, "logits/rejected": -1.0579376220703125, "logps/chosen": -677.4515991210938, "logps/rejected": -792.8468627929688, "loss": 0.4404, "rewards/accuracies": 0.875, "rewards/chosen": -3.9890122413635254, "rewards/margins": 1.685025930404663, "rewards/rejected": -5.674038887023926, "step": 11520 }, { "epoch": 0.75, "grad_norm": 19.25, "learning_rate": 8.636722034597397e-07, "logits/chosen": -1.0981495380401611, "logits/rejected": -0.9629520177841187, "logps/chosen": -626.1697998046875, "logps/rejected": -755.2869262695312, "loss": 0.5464, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.6339874267578125, "rewards/margins": 1.5072648525238037, "rewards/rejected": -5.141252040863037, "step": 11530 }, { "epoch": 0.76, "grad_norm": 28.5, "learning_rate": 8.593592695176814e-07, "logits/chosen": -1.0892744064331055, "logits/rejected": -0.5507336854934692, "logps/chosen": -655.3826293945312, "logps/rejected": -903.8552856445312, "loss": 0.473, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.2122039794921875, "rewards/margins": 2.3466999530792236, "rewards/rejected": -6.55890417098999, "step": 11540 }, { "epoch": 0.76, "grad_norm": 11.25, "learning_rate": 8.550548952058879e-07, "logits/chosen": -1.2217892408370972, "logits/rejected": -1.0100353956222534, "logps/chosen": -623.0684814453125, "logps/rejected": -890.4901123046875, "loss": 0.609, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.956874370574951, "rewards/margins": 2.4463486671447754, "rewards/rejected": -6.403222560882568, "step": 11550 }, { "epoch": 0.76, "grad_norm": 8.875, "learning_rate": 8.507591029813503e-07, "logits/chosen": -1.7788989543914795, "logits/rejected": -1.1032557487487793, "logps/chosen": -587.6117553710938, "logps/rejected": -800.135498046875, "loss": 0.3379, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.646329879760742, "rewards/margins": 2.3320202827453613, "rewards/rejected": -5.9783501625061035, "step": 11560 }, { "epoch": 0.76, "grad_norm": 13.125, "learning_rate": 8.464719152562845e-07, "logits/chosen": -1.4351698160171509, "logits/rejected": -1.2369781732559204, "logps/chosen": -680.2758178710938, "logps/rejected": -822.55517578125, "loss": 0.6137, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.009663105010986, "rewards/margins": 1.861148476600647, "rewards/rejected": -5.870810508728027, "step": 11570 }, { "epoch": 0.76, "grad_norm": 66.0, "learning_rate": 8.421933543980126e-07, "logits/chosen": -1.574904441833496, "logits/rejected": -1.3291524648666382, "logps/chosen": -663.0416259765625, "logps/rejected": -785.6683959960938, "loss": 0.5252, "rewards/accuracies": 0.75, "rewards/chosen": -3.859210968017578, "rewards/margins": 1.9046485424041748, "rewards/rejected": -5.763859748840332, "step": 11580 }, { "epoch": 0.76, "grad_norm": 26.25, "learning_rate": 8.379234427288499e-07, "logits/chosen": -1.6265981197357178, "logits/rejected": -1.41708242893219, "logps/chosen": -719.2433471679688, "logps/rejected": -826.5040893554688, "loss": 0.532, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.126597881317139, "rewards/margins": 1.5951210260391235, "rewards/rejected": -5.721718788146973, "step": 11590 }, { "epoch": 0.76, "grad_norm": 27.875, "learning_rate": 8.336622025259869e-07, "logits/chosen": -1.5296138525009155, "logits/rejected": -1.269536018371582, "logps/chosen": -700.52783203125, "logps/rejected": -788.9874877929688, "loss": 0.5279, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.083573341369629, "rewards/margins": 1.6254247426986694, "rewards/rejected": -5.708998203277588, "step": 11600 }, { "epoch": 0.76, "eval_logits/chosen": -1.4904841184616089, "eval_logits/rejected": -1.1124804019927979, "eval_logps/chosen": -677.7863159179688, "eval_logps/rejected": -843.0165405273438, "eval_loss": 0.5231361985206604, "eval_rewards/accuracies": 0.746999979019165, "eval_rewards/chosen": -4.131667137145996, "eval_rewards/margins": 1.8527817726135254, "eval_rewards/rejected": -5.98444938659668, "eval_runtime": 1082.1644, "eval_samples_per_second": 1.848, "eval_steps_per_second": 1.848, "step": 11600 }, { "epoch": 0.76, "grad_norm": 7.84375, "learning_rate": 8.294096560213718e-07, "logits/chosen": -1.780577301979065, "logits/rejected": -0.8992765545845032, "logps/chosen": -685.2669677734375, "logps/rejected": -855.9033203125, "loss": 0.4583, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.845677137374878, "rewards/margins": 2.104475736618042, "rewards/rejected": -5.950152397155762, "step": 11610 }, { "epoch": 0.76, "grad_norm": 74.0, "learning_rate": 8.251658254015965e-07, "logits/chosen": -1.4227510690689087, "logits/rejected": -0.978543758392334, "logps/chosen": -656.0125122070312, "logps/rejected": -801.9383544921875, "loss": 0.7133, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.2729010581970215, "rewards/margins": 1.5784087181091309, "rewards/rejected": -5.851309776306152, "step": 11620 }, { "epoch": 0.76, "grad_norm": 71.0, "learning_rate": 8.209307328077801e-07, "logits/chosen": -1.567085862159729, "logits/rejected": -0.9493902325630188, "logps/chosen": -720.2786254882812, "logps/rejected": -907.5556640625, "loss": 0.5445, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.436817169189453, "rewards/margins": 2.097930431365967, "rewards/rejected": -6.534747123718262, "step": 11630 }, { "epoch": 0.76, "grad_norm": 36.25, "learning_rate": 8.167044003354513e-07, "logits/chosen": -1.628775954246521, "logits/rejected": -0.9060964584350586, "logps/chosen": -680.4591064453125, "logps/rejected": -909.6395263671875, "loss": 0.6038, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.479031085968018, "rewards/margins": 2.0956978797912598, "rewards/rejected": -6.574728488922119, "step": 11640 }, { "epoch": 0.76, "grad_norm": 36.5, "learning_rate": 8.124868500344374e-07, "logits/chosen": -1.638683557510376, "logits/rejected": -1.434559941291809, "logps/chosen": -612.7188720703125, "logps/rejected": -805.5785522460938, "loss": 0.6596, "rewards/accuracies": 0.75, "rewards/chosen": -3.767033815383911, "rewards/margins": 1.4541517496109009, "rewards/rejected": -5.221185684204102, "step": 11650 }, { "epoch": 0.76, "grad_norm": 9.875, "learning_rate": 8.08278103908747e-07, "logits/chosen": -1.703437089920044, "logits/rejected": -0.5194367170333862, "logps/chosen": -703.4808349609375, "logps/rejected": -898.1657104492188, "loss": 0.4192, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.210177421569824, "rewards/margins": 1.9958454370498657, "rewards/rejected": -6.206023216247559, "step": 11660 }, { "epoch": 0.76, "grad_norm": 35.5, "learning_rate": 8.040781839164522e-07, "logits/chosen": -1.4224512577056885, "logits/rejected": -0.7236167788505554, "logps/chosen": -701.8591918945312, "logps/rejected": -847.6331787109375, "loss": 0.5314, "rewards/accuracies": 0.75, "rewards/chosen": -4.245940685272217, "rewards/margins": 1.834656000137329, "rewards/rejected": -6.080596923828125, "step": 11670 }, { "epoch": 0.76, "grad_norm": 24.0, "learning_rate": 7.9988711196958e-07, "logits/chosen": -1.4947714805603027, "logits/rejected": -0.8905059099197388, "logps/chosen": -583.1505126953125, "logps/rejected": -828.5579223632812, "loss": 0.3587, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.8961100578308105, "rewards/margins": 2.610759735107422, "rewards/rejected": -6.506869316101074, "step": 11680 }, { "epoch": 0.76, "grad_norm": 24.375, "learning_rate": 7.957049099339939e-07, "logits/chosen": -1.5780426263809204, "logits/rejected": -1.0495127439498901, "logps/chosen": -636.3817138671875, "logps/rejected": -822.10986328125, "loss": 0.4957, "rewards/accuracies": 0.75, "rewards/chosen": -3.773613452911377, "rewards/margins": 2.0791642665863037, "rewards/rejected": -5.852778434753418, "step": 11690 }, { "epoch": 0.77, "grad_norm": 60.0, "learning_rate": 7.915315996292817e-07, "logits/chosen": -1.3479483127593994, "logits/rejected": -1.00907301902771, "logps/chosen": -691.164794921875, "logps/rejected": -804.6561889648438, "loss": 0.5654, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.313303470611572, "rewards/margins": 1.691542387008667, "rewards/rejected": -6.00484561920166, "step": 11700 }, { "epoch": 0.77, "eval_logits/chosen": -1.5063122510910034, "eval_logits/rejected": -1.1325358152389526, "eval_logps/chosen": -674.6688842773438, "eval_logps/rejected": -838.8231201171875, "eval_loss": 0.523455798625946, "eval_rewards/accuracies": 0.7450000047683716, "eval_rewards/chosen": -4.100492477416992, "eval_rewards/margins": 1.8420226573944092, "eval_rewards/rejected": -5.942515850067139, "eval_runtime": 1085.9427, "eval_samples_per_second": 1.842, "eval_steps_per_second": 1.842, "step": 11700 }, { "epoch": 0.77, "grad_norm": 10.875, "learning_rate": 7.873672028286375e-07, "logits/chosen": -1.3153307437896729, "logits/rejected": -1.098110556602478, "logps/chosen": -663.5138549804688, "logps/rejected": -848.38134765625, "loss": 0.4603, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.7647461891174316, "rewards/margins": 2.2859947681427, "rewards/rejected": -6.050740718841553, "step": 11710 }, { "epoch": 0.77, "grad_norm": 20.125, "learning_rate": 7.832117412587572e-07, "logits/chosen": -1.5956151485443115, "logits/rejected": -1.2304719686508179, "logps/chosen": -597.2197265625, "logps/rejected": -795.50390625, "loss": 0.4935, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.6629326343536377, "rewards/margins": 1.9042421579360962, "rewards/rejected": -5.567174434661865, "step": 11720 }, { "epoch": 0.77, "grad_norm": 18.5, "learning_rate": 7.790652365997137e-07, "logits/chosen": -1.542378306388855, "logits/rejected": -0.8948489427566528, "logps/chosen": -724.7398681640625, "logps/rejected": -898.3888549804688, "loss": 0.3766, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.27140998840332, "rewards/margins": 2.2751266956329346, "rewards/rejected": -6.546536445617676, "step": 11730 }, { "epoch": 0.77, "grad_norm": 19.5, "learning_rate": 7.749277104848529e-07, "logits/chosen": -1.6090641021728516, "logits/rejected": -1.318196177482605, "logps/chosen": -654.6233520507812, "logps/rejected": -771.0850830078125, "loss": 0.678, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -3.788322925567627, "rewards/margins": 1.4806817770004272, "rewards/rejected": -5.269004821777344, "step": 11740 }, { "epoch": 0.77, "grad_norm": 10.125, "learning_rate": 7.707991845006768e-07, "logits/chosen": -1.7783253192901611, "logits/rejected": -1.4519462585449219, "logps/chosen": -693.9760131835938, "logps/rejected": -802.0398559570312, "loss": 0.482, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.814502716064453, "rewards/margins": 1.7175853252410889, "rewards/rejected": -5.532088279724121, "step": 11750 }, { "epoch": 0.77, "grad_norm": 11.375, "learning_rate": 7.666796801867299e-07, "logits/chosen": -1.4650354385375977, "logits/rejected": -0.8849795460700989, "logps/chosen": -665.6531372070312, "logps/rejected": -896.9962768554688, "loss": 0.2986, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.721773862838745, "rewards/margins": 2.6028265953063965, "rewards/rejected": -6.3246002197265625, "step": 11760 }, { "epoch": 0.77, "grad_norm": 26.125, "learning_rate": 7.625692190354902e-07, "logits/chosen": -1.5395559072494507, "logits/rejected": -0.9380024075508118, "logps/chosen": -669.8912963867188, "logps/rejected": -817.3536987304688, "loss": 0.313, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.8047969341278076, "rewards/margins": 2.125767707824707, "rewards/rejected": -5.930564880371094, "step": 11770 }, { "epoch": 0.77, "grad_norm": 11.125, "learning_rate": 7.584678224922548e-07, "logits/chosen": -1.2558648586273193, "logits/rejected": -0.9297658205032349, "logps/chosen": -647.3333740234375, "logps/rejected": -872.5603637695312, "loss": 0.3926, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.133570671081543, "rewards/margins": 2.2376017570495605, "rewards/rejected": -6.3711724281311035, "step": 11780 }, { "epoch": 0.77, "grad_norm": 24.875, "learning_rate": 7.543755119550289e-07, "logits/chosen": -1.6013009548187256, "logits/rejected": -1.2641499042510986, "logps/chosen": -673.2734375, "logps/rejected": -819.8677978515625, "loss": 0.4643, "rewards/accuracies": 0.75, "rewards/chosen": -4.026200294494629, "rewards/margins": 1.5969005823135376, "rewards/rejected": -5.623100757598877, "step": 11790 }, { "epoch": 0.77, "grad_norm": 23.375, "learning_rate": 7.502923087744118e-07, "logits/chosen": -1.6481777429580688, "logits/rejected": -1.1611673831939697, "logps/chosen": -692.1486206054688, "logps/rejected": -854.4332275390625, "loss": 0.6573, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.265809059143066, "rewards/margins": 1.6619268655776978, "rewards/rejected": -5.927735328674316, "step": 11800 }, { "epoch": 0.77, "eval_logits/chosen": -1.500537395477295, "eval_logits/rejected": -1.1285433769226074, "eval_logps/chosen": -678.0628662109375, "eval_logps/rejected": -842.6799926757812, "eval_loss": 0.5227764844894409, "eval_rewards/accuracies": 0.7455000281333923, "eval_rewards/chosen": -4.134433269500732, "eval_rewards/margins": 1.8466509580612183, "eval_rewards/rejected": -5.981083869934082, "eval_runtime": 1082.1456, "eval_samples_per_second": 1.848, "eval_steps_per_second": 1.848, "step": 11800 }, { "epoch": 0.77, "grad_norm": 52.5, "learning_rate": 7.462182342534896e-07, "logits/chosen": -1.3030363321304321, "logits/rejected": -1.1278133392333984, "logps/chosen": -729.248291015625, "logps/rejected": -827.5103759765625, "loss": 0.4919, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.3693928718566895, "rewards/margins": 1.7341487407684326, "rewards/rejected": -6.103541374206543, "step": 11810 }, { "epoch": 0.77, "grad_norm": 11.3125, "learning_rate": 7.42153309647721e-07, "logits/chosen": -1.8117586374282837, "logits/rejected": -1.3732068538665771, "logps/chosen": -619.8848876953125, "logps/rejected": -799.4303588867188, "loss": 0.3169, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.6352810859680176, "rewards/margins": 2.331641435623169, "rewards/rejected": -5.966922760009766, "step": 11820 }, { "epoch": 0.77, "grad_norm": 23.875, "learning_rate": 7.38097556164828e-07, "logits/chosen": -1.843809723854065, "logits/rejected": -1.5940641164779663, "logps/chosen": -610.4765014648438, "logps/rejected": -829.7083740234375, "loss": 0.4678, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.473951816558838, "rewards/margins": 2.0710220336914062, "rewards/rejected": -5.544973850250244, "step": 11830 }, { "epoch": 0.77, "grad_norm": 35.0, "learning_rate": 7.340509949646835e-07, "logits/chosen": -1.0151475667953491, "logits/rejected": -0.7792425155639648, "logps/chosen": -598.1163330078125, "logps/rejected": -830.2200317382812, "loss": 0.5053, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.336109161376953, "rewards/margins": 2.0048575401306152, "rewards/rejected": -6.340967655181885, "step": 11840 }, { "epoch": 0.78, "grad_norm": 42.0, "learning_rate": 7.300136471592034e-07, "logits/chosen": -1.8083137273788452, "logits/rejected": -1.2982746362686157, "logps/chosen": -648.6859130859375, "logps/rejected": -723.4273681640625, "loss": 0.6605, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.985044002532959, "rewards/margins": 1.071336030960083, "rewards/rejected": -5.056380271911621, "step": 11850 }, { "epoch": 0.78, "grad_norm": 46.75, "learning_rate": 7.259855338122332e-07, "logits/chosen": -1.6242583990097046, "logits/rejected": -1.1966116428375244, "logps/chosen": -652.71875, "logps/rejected": -762.2576293945312, "loss": 0.4717, "rewards/accuracies": 0.75, "rewards/chosen": -3.8997490406036377, "rewards/margins": 1.6256234645843506, "rewards/rejected": -5.525372505187988, "step": 11860 }, { "epoch": 0.78, "grad_norm": 18.125, "learning_rate": 7.219666759394409e-07, "logits/chosen": -1.4722648859024048, "logits/rejected": -0.6479132175445557, "logps/chosen": -696.8582153320312, "logps/rejected": -825.0279541015625, "loss": 0.436, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.085333824157715, "rewards/margins": 2.002523422241211, "rewards/rejected": -6.087857246398926, "step": 11870 }, { "epoch": 0.78, "grad_norm": 24.375, "learning_rate": 7.179570945082079e-07, "logits/chosen": -1.6112852096557617, "logits/rejected": -1.2399073839187622, "logps/chosen": -703.17919921875, "logps/rejected": -831.794921875, "loss": 0.4449, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.128902435302734, "rewards/margins": 1.744604468345642, "rewards/rejected": -5.873507022857666, "step": 11880 }, { "epoch": 0.78, "grad_norm": 12.125, "learning_rate": 7.139568104375144e-07, "logits/chosen": -1.228761076927185, "logits/rejected": -0.978671669960022, "logps/chosen": -601.1947021484375, "logps/rejected": -818.465576171875, "loss": 0.5145, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.572385311126709, "rewards/margins": 2.274453639984131, "rewards/rejected": -5.846838474273682, "step": 11890 }, { "epoch": 0.78, "grad_norm": 41.75, "learning_rate": 7.099658445978372e-07, "logits/chosen": -1.6251037120819092, "logits/rejected": -0.9895383715629578, "logps/chosen": -696.36572265625, "logps/rejected": -842.365234375, "loss": 0.4045, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.9776854515075684, "rewards/margins": 1.9677213430404663, "rewards/rejected": -5.945406913757324, "step": 11900 }, { "epoch": 0.78, "eval_logits/chosen": -1.4978458881378174, "eval_logits/rejected": -1.1270947456359863, "eval_logps/chosen": -680.6879272460938, "eval_logps/rejected": -844.8413696289062, "eval_loss": 0.5222220420837402, "eval_rewards/accuracies": 0.7465000152587891, "eval_rewards/chosen": -4.160682678222656, "eval_rewards/margins": 1.8420147895812988, "eval_rewards/rejected": -6.002697467803955, "eval_runtime": 1082.1817, "eval_samples_per_second": 1.848, "eval_steps_per_second": 1.848, "step": 11900 }, { "epoch": 0.78, "grad_norm": 10.875, "learning_rate": 7.059842178110377e-07, "logits/chosen": -1.7087364196777344, "logits/rejected": -1.21110200881958, "logps/chosen": -719.2329711914062, "logps/rejected": -873.29638671875, "loss": 0.4667, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.8367958068847656, "rewards/margins": 1.8783191442489624, "rewards/rejected": -5.715115070343018, "step": 11910 }, { "epoch": 0.78, "grad_norm": 21.75, "learning_rate": 7.020119508502498e-07, "logits/chosen": -1.218531608581543, "logits/rejected": -1.0275070667266846, "logps/chosen": -696.7399291992188, "logps/rejected": -831.6785278320312, "loss": 0.5384, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.023758411407471, "rewards/margins": 1.7635014057159424, "rewards/rejected": -5.787259578704834, "step": 11920 }, { "epoch": 0.78, "grad_norm": 16.125, "learning_rate": 6.980490644397772e-07, "logits/chosen": -1.3381099700927734, "logits/rejected": -0.9582138061523438, "logps/chosen": -682.9896240234375, "logps/rejected": -862.9490966796875, "loss": 0.5458, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.208303451538086, "rewards/margins": 2.23018741607666, "rewards/rejected": -6.438490867614746, "step": 11930 }, { "epoch": 0.78, "grad_norm": 46.75, "learning_rate": 6.940955792549855e-07, "logits/chosen": -1.8159472942352295, "logits/rejected": -1.471983790397644, "logps/chosen": -674.5723876953125, "logps/rejected": -855.2589111328125, "loss": 0.5656, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.7438106536865234, "rewards/margins": 1.688943862915039, "rewards/rejected": -5.4327545166015625, "step": 11940 }, { "epoch": 0.78, "grad_norm": 24.125, "learning_rate": 6.901515159221863e-07, "logits/chosen": -1.595873236656189, "logits/rejected": -1.6501624584197998, "logps/chosen": -694.7517700195312, "logps/rejected": -834.9244384765625, "loss": 0.6077, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -3.9564285278320312, "rewards/margins": 1.671661615371704, "rewards/rejected": -5.628089427947998, "step": 11950 }, { "epoch": 0.78, "grad_norm": 32.0, "learning_rate": 6.862168950185389e-07, "logits/chosen": -1.6871302127838135, "logits/rejected": -1.5468018054962158, "logps/chosen": -725.8421020507812, "logps/rejected": -784.5599365234375, "loss": 0.6214, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.266391754150391, "rewards/margins": 1.3449256420135498, "rewards/rejected": -5.6113176345825195, "step": 11960 }, { "epoch": 0.78, "grad_norm": 50.0, "learning_rate": 6.822917370719387e-07, "logits/chosen": -1.497341513633728, "logits/rejected": -0.3917994201183319, "logps/chosen": -738.779052734375, "logps/rejected": -846.099609375, "loss": 0.5744, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.635067939758301, "rewards/margins": 1.5919008255004883, "rewards/rejected": -6.226968288421631, "step": 11970 }, { "epoch": 0.78, "grad_norm": 18.25, "learning_rate": 6.783760625609079e-07, "logits/chosen": -1.6000808477401733, "logits/rejected": -0.766375720500946, "logps/chosen": -677.2998046875, "logps/rejected": -776.2774658203125, "loss": 0.6101, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.9846222400665283, "rewards/margins": 1.700716257095337, "rewards/rejected": -5.685338020324707, "step": 11980 }, { "epoch": 0.78, "grad_norm": 66.0, "learning_rate": 6.744698919144943e-07, "logits/chosen": -1.3956226110458374, "logits/rejected": -1.4166101217269897, "logps/chosen": -764.3978271484375, "logps/rejected": -830.7586669921875, "loss": 0.7016, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.510998725891113, "rewards/margins": 1.1108975410461426, "rewards/rejected": -5.621896266937256, "step": 11990 }, { "epoch": 0.79, "grad_norm": 42.0, "learning_rate": 6.705732455121614e-07, "logits/chosen": -1.4741584062576294, "logits/rejected": -1.431957721710205, "logps/chosen": -685.6370849609375, "logps/rejected": -869.1770629882812, "loss": 0.436, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.1467790603637695, "rewards/margins": 2.1343014240264893, "rewards/rejected": -6.281080722808838, "step": 12000 }, { "epoch": 0.79, "eval_logits/chosen": -1.5060962438583374, "eval_logits/rejected": -1.1402665376663208, "eval_logps/chosen": -676.4965209960938, "eval_logps/rejected": -837.9908447265625, "eval_loss": 0.5193271040916443, "eval_rewards/accuracies": 0.7455000281333923, "eval_rewards/chosen": -4.11876916885376, "eval_rewards/margins": 1.8154231309890747, "eval_rewards/rejected": -5.934192180633545, "eval_runtime": 1082.157, "eval_samples_per_second": 1.848, "eval_steps_per_second": 1.848, "step": 12000 }, { "epoch": 0.79, "grad_norm": 8.6875, "learning_rate": 6.666861436836797e-07, "logits/chosen": -1.688550591468811, "logits/rejected": -1.0791298151016235, "logps/chosen": -593.9473266601562, "logps/rejected": -852.4251708984375, "loss": 0.3962, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.5187058448791504, "rewards/margins": 2.625777006149292, "rewards/rejected": -6.144482612609863, "step": 12010 }, { "epoch": 0.79, "grad_norm": 33.25, "learning_rate": 6.628086067090259e-07, "logits/chosen": -1.730746865272522, "logits/rejected": -1.0945309400558472, "logps/chosen": -642.9513549804688, "logps/rejected": -848.9357299804688, "loss": 0.6345, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.103895664215088, "rewards/margins": 1.792426347732544, "rewards/rejected": -5.8963212966918945, "step": 12020 }, { "epoch": 0.79, "grad_norm": 12.875, "learning_rate": 6.589406548182739e-07, "logits/chosen": -1.407861590385437, "logits/rejected": -1.2673335075378418, "logps/chosen": -754.3253784179688, "logps/rejected": -815.3931274414062, "loss": 0.6125, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.47873592376709, "rewards/margins": 1.3881124258041382, "rewards/rejected": -5.866847991943359, "step": 12030 }, { "epoch": 0.79, "grad_norm": 46.25, "learning_rate": 6.550823081914892e-07, "logits/chosen": -1.537139654159546, "logits/rejected": -1.4469596147537231, "logps/chosen": -692.9555053710938, "logps/rejected": -735.6586303710938, "loss": 0.6235, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.960869550704956, "rewards/margins": 1.1308437585830688, "rewards/rejected": -5.091713905334473, "step": 12040 }, { "epoch": 0.79, "grad_norm": 35.0, "learning_rate": 6.512335869586253e-07, "logits/chosen": -1.5150340795516968, "logits/rejected": -1.2375071048736572, "logps/chosen": -677.3089599609375, "logps/rejected": -833.4364013671875, "loss": 0.4736, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.7448344230651855, "rewards/margins": 1.587040662765503, "rewards/rejected": -5.331874847412109, "step": 12050 }, { "epoch": 0.79, "grad_norm": 81.0, "learning_rate": 6.47394511199417e-07, "logits/chosen": -1.2394012212753296, "logits/rejected": -1.015906572341919, "logps/chosen": -621.1302490234375, "logps/rejected": -811.2617797851562, "loss": 0.5447, "rewards/accuracies": 0.75, "rewards/chosen": -4.064301490783691, "rewards/margins": 1.7325935363769531, "rewards/rejected": -5.796894073486328, "step": 12060 }, { "epoch": 0.79, "grad_norm": 34.5, "learning_rate": 6.435651009432753e-07, "logits/chosen": -1.2864211797714233, "logits/rejected": -0.7344447374343872, "logps/chosen": -759.8924560546875, "logps/rejected": -881.9423828125, "loss": 0.4807, "rewards/accuracies": 0.75, "rewards/chosen": -4.4373674392700195, "rewards/margins": 1.6608721017837524, "rewards/rejected": -6.098239421844482, "step": 12070 }, { "epoch": 0.79, "grad_norm": 15.0, "learning_rate": 6.397453761691855e-07, "logits/chosen": -1.4392045736312866, "logits/rejected": -1.5878310203552246, "logps/chosen": -614.2974853515625, "logps/rejected": -748.1862182617188, "loss": 0.6289, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.7002017498016357, "rewards/margins": 1.1875829696655273, "rewards/rejected": -4.887784957885742, "step": 12080 }, { "epoch": 0.79, "grad_norm": 56.25, "learning_rate": 6.359353568056001e-07, "logits/chosen": -1.5289428234100342, "logits/rejected": -1.466713547706604, "logps/chosen": -725.5125732421875, "logps/rejected": -927.7677612304688, "loss": 0.6665, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.550330638885498, "rewards/margins": 1.7705971002578735, "rewards/rejected": -6.320927619934082, "step": 12090 }, { "epoch": 0.79, "grad_norm": 60.75, "learning_rate": 6.321350627303383e-07, "logits/chosen": -1.628326416015625, "logits/rejected": -1.5627050399780273, "logps/chosen": -663.7351684570312, "logps/rejected": -817.0255737304688, "loss": 0.519, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.006964206695557, "rewards/margins": 1.5597261190414429, "rewards/rejected": -5.566690444946289, "step": 12100 }, { "epoch": 0.79, "eval_logits/chosen": -1.518898367881775, "eval_logits/rejected": -1.155161738395691, "eval_logps/chosen": -666.9061889648438, "eval_logps/rejected": -825.2211303710938, "eval_loss": 0.516356348991394, "eval_rewards/accuracies": 0.7494999766349792, "eval_rewards/chosen": -4.0228657722473145, "eval_rewards/margins": 1.7836289405822754, "eval_rewards/rejected": -5.806495189666748, "eval_runtime": 1082.6006, "eval_samples_per_second": 1.847, "eval_steps_per_second": 1.847, "step": 12100 }, { "epoch": 0.79, "grad_norm": 28.75, "learning_rate": 6.283445137704761e-07, "logits/chosen": -1.3784784078598022, "logits/rejected": -1.3448007106781006, "logps/chosen": -663.7547607421875, "logps/rejected": -798.6753540039062, "loss": 0.4399, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.209786415100098, "rewards/margins": 1.691099762916565, "rewards/rejected": -5.900887489318848, "step": 12110 }, { "epoch": 0.79, "grad_norm": 18.875, "learning_rate": 6.2456372970225e-07, "logits/chosen": -1.3183958530426025, "logits/rejected": -0.8962675929069519, "logps/chosen": -653.38818359375, "logps/rejected": -790.5770263671875, "loss": 0.4954, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.9129281044006348, "rewards/margins": 1.8729385137557983, "rewards/rejected": -5.785865783691406, "step": 12120 }, { "epoch": 0.79, "grad_norm": 12.0625, "learning_rate": 6.207927302509509e-07, "logits/chosen": -1.47410249710083, "logits/rejected": -1.4786646366119385, "logps/chosen": -590.216796875, "logps/rejected": -707.4972534179688, "loss": 0.5748, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.7680563926696777, "rewards/margins": 1.4441457986831665, "rewards/rejected": -5.212202548980713, "step": 12130 }, { "epoch": 0.79, "grad_norm": 30.0, "learning_rate": 6.170315350908176e-07, "logits/chosen": -1.64339280128479, "logits/rejected": -1.330707311630249, "logps/chosen": -631.884521484375, "logps/rejected": -772.6292114257812, "loss": 0.4751, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.639812469482422, "rewards/margins": 1.9739468097686768, "rewards/rejected": -5.6137590408325195, "step": 12140 }, { "epoch": 0.79, "grad_norm": 31.375, "learning_rate": 6.132801638449409e-07, "logits/chosen": -1.57778000831604, "logits/rejected": -1.317490816116333, "logps/chosen": -679.09375, "logps/rejected": -795.4837646484375, "loss": 0.6662, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -4.18748140335083, "rewards/margins": 1.1778626441955566, "rewards/rejected": -5.365344047546387, "step": 12150 }, { "epoch": 0.8, "grad_norm": 56.0, "learning_rate": 6.095386360851566e-07, "logits/chosen": -1.7877956628799438, "logits/rejected": -1.4785343408584595, "logps/chosen": -644.0350341796875, "logps/rejected": -853.9937744140625, "loss": 0.4212, "rewards/accuracies": 0.75, "rewards/chosen": -3.5087997913360596, "rewards/margins": 2.053851366043091, "rewards/rejected": -5.56265115737915, "step": 12160 }, { "epoch": 0.8, "grad_norm": 19.625, "learning_rate": 6.058069713319439e-07, "logits/chosen": -1.390193223953247, "logits/rejected": -1.1522866487503052, "logps/chosen": -659.2498779296875, "logps/rejected": -785.1752319335938, "loss": 0.662, "rewards/accuracies": 0.75, "rewards/chosen": -4.273807525634766, "rewards/margins": 1.399375319480896, "rewards/rejected": -5.673183441162109, "step": 12170 }, { "epoch": 0.8, "grad_norm": 20.0, "learning_rate": 6.020851890543253e-07, "logits/chosen": -1.267600417137146, "logits/rejected": -1.049772024154663, "logps/chosen": -602.1968383789062, "logps/rejected": -795.1844482421875, "loss": 0.5892, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.8921866416931152, "rewards/margins": 1.5968676805496216, "rewards/rejected": -5.4890546798706055, "step": 12180 }, { "epoch": 0.8, "grad_norm": 66.5, "learning_rate": 5.983733086697641e-07, "logits/chosen": -1.4135277271270752, "logits/rejected": -1.2078399658203125, "logps/chosen": -658.5962524414062, "logps/rejected": -866.0196533203125, "loss": 0.4302, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.977917194366455, "rewards/margins": 2.205653429031372, "rewards/rejected": -6.183570384979248, "step": 12190 }, { "epoch": 0.8, "grad_norm": 10.6875, "learning_rate": 5.94671349544061e-07, "logits/chosen": -1.1038484573364258, "logits/rejected": -1.1312044858932495, "logps/chosen": -665.5697021484375, "logps/rejected": -795.4265747070312, "loss": 0.5342, "rewards/accuracies": 0.75, "rewards/chosen": -4.127209663391113, "rewards/margins": 1.3009650707244873, "rewards/rejected": -5.4281744956970215, "step": 12200 }, { "epoch": 0.8, "eval_logits/chosen": -1.5231178998947144, "eval_logits/rejected": -1.1597187519073486, "eval_logps/chosen": -662.93994140625, "eval_logps/rejected": -821.230224609375, "eval_loss": 0.5154730081558228, "eval_rewards/accuracies": 0.7484999895095825, "eval_rewards/chosen": -3.983203649520874, "eval_rewards/margins": 1.7833832502365112, "eval_rewards/rejected": -5.7665863037109375, "eval_runtime": 1082.1312, "eval_samples_per_second": 1.848, "eval_steps_per_second": 1.848, "step": 12200 }, { "epoch": 0.8, "grad_norm": 16.5, "learning_rate": 5.909793309912571e-07, "logits/chosen": -1.648316740989685, "logits/rejected": -1.0195114612579346, "logps/chosen": -618.3221435546875, "logps/rejected": -781.4452514648438, "loss": 0.3482, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.7221946716308594, "rewards/margins": 1.9397013187408447, "rewards/rejected": -5.661895751953125, "step": 12210 }, { "epoch": 0.8, "grad_norm": 7.96875, "learning_rate": 5.87297272273531e-07, "logits/chosen": -1.8792152404785156, "logits/rejected": -1.4424282312393188, "logps/chosen": -689.66552734375, "logps/rejected": -845.8053588867188, "loss": 0.4578, "rewards/accuracies": 0.75, "rewards/chosen": -3.7710349559783936, "rewards/margins": 1.635009765625, "rewards/rejected": -5.406044960021973, "step": 12220 }, { "epoch": 0.8, "grad_norm": 8.4375, "learning_rate": 5.836251926010966e-07, "logits/chosen": -1.6835416555404663, "logits/rejected": -0.7428938746452332, "logps/chosen": -735.2413940429688, "logps/rejected": -839.2423706054688, "loss": 0.447, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.058168888092041, "rewards/margins": 1.985741376876831, "rewards/rejected": -6.043910026550293, "step": 12230 }, { "epoch": 0.8, "grad_norm": 22.875, "learning_rate": 5.799631111321063e-07, "logits/chosen": -1.3358148336410522, "logits/rejected": -1.145738124847412, "logps/chosen": -563.4652709960938, "logps/rejected": -742.3917236328125, "loss": 0.3595, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.5353798866271973, "rewards/margins": 1.9116445779800415, "rewards/rejected": -5.447024345397949, "step": 12240 }, { "epoch": 0.8, "grad_norm": 43.0, "learning_rate": 5.763110469725489e-07, "logits/chosen": -1.5881760120391846, "logits/rejected": -1.1227892637252808, "logps/chosen": -666.4017333984375, "logps/rejected": -854.9514770507812, "loss": 0.4403, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.008008003234863, "rewards/margins": 1.831017255783081, "rewards/rejected": -5.839025497436523, "step": 12250 }, { "epoch": 0.8, "grad_norm": 6.46875, "learning_rate": 5.726690191761502e-07, "logits/chosen": -1.5599058866500854, "logits/rejected": -1.6166801452636719, "logps/chosen": -688.8355102539062, "logps/rejected": -921.2562255859375, "loss": 0.4948, "rewards/accuracies": 0.75, "rewards/chosen": -3.8233330249786377, "rewards/margins": 2.0503437519073486, "rewards/rejected": -5.873676300048828, "step": 12260 }, { "epoch": 0.8, "grad_norm": 44.5, "learning_rate": 5.690370467442743e-07, "logits/chosen": -1.4836223125457764, "logits/rejected": -1.299156904220581, "logps/chosen": -649.4371337890625, "logps/rejected": -796.7279052734375, "loss": 0.6198, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.299609184265137, "rewards/margins": 1.3893229961395264, "rewards/rejected": -5.688932418823242, "step": 12270 }, { "epoch": 0.8, "grad_norm": 8.75, "learning_rate": 5.654151486258244e-07, "logits/chosen": -1.5862648487091064, "logits/rejected": -0.6133139729499817, "logps/chosen": -632.583984375, "logps/rejected": -790.6829223632812, "loss": 0.6279, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.9248416423797607, "rewards/margins": 1.7242481708526611, "rewards/rejected": -5.649089813232422, "step": 12280 }, { "epoch": 0.8, "grad_norm": 22.375, "learning_rate": 5.618033437171408e-07, "logits/chosen": -1.9040164947509766, "logits/rejected": -1.3887450695037842, "logps/chosen": -647.3591918945312, "logps/rejected": -865.3876953125, "loss": 0.5215, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.689727783203125, "rewards/margins": 1.9814634323120117, "rewards/rejected": -5.671191215515137, "step": 12290 }, { "epoch": 0.8, "grad_norm": 27.25, "learning_rate": 5.582016508619084e-07, "logits/chosen": -1.269507884979248, "logits/rejected": -0.9271600842475891, "logps/chosen": -667.913330078125, "logps/rejected": -846.1954345703125, "loss": 0.3715, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.0297722816467285, "rewards/margins": 1.9773470163345337, "rewards/rejected": -6.007119178771973, "step": 12300 }, { "epoch": 0.8, "eval_logits/chosen": -1.5152314901351929, "eval_logits/rejected": -1.1525275707244873, "eval_logps/chosen": -667.1307373046875, "eval_logps/rejected": -827.5243530273438, "eval_loss": 0.5171339511871338, "eval_rewards/accuracies": 0.7465000152587891, "eval_rewards/chosen": -4.025111675262451, "eval_rewards/margins": 1.8044151067733765, "eval_rewards/rejected": -5.829527378082275, "eval_runtime": 1083.0933, "eval_samples_per_second": 1.847, "eval_steps_per_second": 1.847, "step": 12300 }, { "epoch": 0.81, "grad_norm": 9.5, "learning_rate": 5.546100888510536e-07, "logits/chosen": -1.3981966972351074, "logits/rejected": -1.4832366704940796, "logps/chosen": -653.8504028320312, "logps/rejected": -755.0343627929688, "loss": 0.5811, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.004908561706543, "rewards/margins": 1.5947930812835693, "rewards/rejected": -5.599701881408691, "step": 12310 }, { "epoch": 0.81, "grad_norm": 22.625, "learning_rate": 5.510286764226466e-07, "logits/chosen": -1.380174994468689, "logits/rejected": -0.8893178701400757, "logps/chosen": -635.16650390625, "logps/rejected": -790.3191528320312, "loss": 0.4582, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.828296661376953, "rewards/margins": 1.7733871936798096, "rewards/rejected": -5.601684093475342, "step": 12320 }, { "epoch": 0.81, "grad_norm": 7.625, "learning_rate": 5.474574322618065e-07, "logits/chosen": -1.481265902519226, "logits/rejected": -0.8396648168563843, "logps/chosen": -645.6865234375, "logps/rejected": -788.9634399414062, "loss": 0.3773, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.296910047531128, "rewards/margins": 1.9829117059707642, "rewards/rejected": -5.279821872711182, "step": 12330 }, { "epoch": 0.81, "grad_norm": 11.9375, "learning_rate": 5.43896375000601e-07, "logits/chosen": -1.4928016662597656, "logits/rejected": -1.399658441543579, "logps/chosen": -771.0603637695312, "logps/rejected": -1010.2738037109375, "loss": 0.4196, "rewards/accuracies": 0.75, "rewards/chosen": -4.548229694366455, "rewards/margins": 2.340651750564575, "rewards/rejected": -6.888881683349609, "step": 12340 }, { "epoch": 0.81, "grad_norm": 9.1875, "learning_rate": 5.403455232179513e-07, "logits/chosen": -1.3793933391571045, "logits/rejected": -1.4371105432510376, "logps/chosen": -677.1878662109375, "logps/rejected": -789.11572265625, "loss": 0.5103, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.997267246246338, "rewards/margins": 1.3811867237091064, "rewards/rejected": -5.378454685211182, "step": 12350 }, { "epoch": 0.81, "grad_norm": 31.875, "learning_rate": 5.368048954395316e-07, "logits/chosen": -1.5392582416534424, "logits/rejected": -1.0343365669250488, "logps/chosen": -679.5765380859375, "logps/rejected": -869.6309814453125, "loss": 0.4322, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.021129131317139, "rewards/margins": 2.023092269897461, "rewards/rejected": -6.044222354888916, "step": 12360 }, { "epoch": 0.81, "grad_norm": 11.625, "learning_rate": 5.332745101376788e-07, "logits/chosen": -1.0732837915420532, "logits/rejected": -1.3295276165008545, "logps/chosen": -604.3509521484375, "logps/rejected": -810.8245239257812, "loss": 0.581, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.036755084991455, "rewards/margins": 1.8078563213348389, "rewards/rejected": -5.844611167907715, "step": 12370 }, { "epoch": 0.81, "grad_norm": 24.5, "learning_rate": 5.297543857312881e-07, "logits/chosen": -1.5340631008148193, "logits/rejected": -0.5385341644287109, "logps/chosen": -702.5737915039062, "logps/rejected": -871.7227783203125, "loss": 0.4325, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.311629295349121, "rewards/margins": 2.1040730476379395, "rewards/rejected": -6.415701866149902, "step": 12380 }, { "epoch": 0.81, "grad_norm": 20.125, "learning_rate": 5.262445405857238e-07, "logits/chosen": -1.4322452545166016, "logits/rejected": -1.5473740100860596, "logps/chosen": -623.6793212890625, "logps/rejected": -897.1373291015625, "loss": 0.3733, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.548776149749756, "rewards/margins": 2.3999645709991455, "rewards/rejected": -5.948741436004639, "step": 12390 }, { "epoch": 0.81, "grad_norm": 45.25, "learning_rate": 5.227449930127207e-07, "logits/chosen": -1.221317172050476, "logits/rejected": -1.2829525470733643, "logps/chosen": -688.2510986328125, "logps/rejected": -827.9011840820312, "loss": 0.7344, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.428099632263184, "rewards/margins": 1.645939588546753, "rewards/rejected": -6.074038505554199, "step": 12400 }, { "epoch": 0.81, "eval_logits/chosen": -1.49436616897583, "eval_logits/rejected": -1.12807035446167, "eval_logps/chosen": -677.2385864257812, "eval_logps/rejected": -839.7449951171875, "eval_loss": 0.5186539888381958, "eval_rewards/accuracies": 0.746999979019165, "eval_rewards/chosen": -4.126189708709717, "eval_rewards/margins": 1.8255436420440674, "eval_rewards/rejected": -5.9517340660095215, "eval_runtime": 1082.1808, "eval_samples_per_second": 1.848, "eval_steps_per_second": 1.848, "step": 12400 }, { "epoch": 0.81, "grad_norm": 15.25, "learning_rate": 5.192557612702862e-07, "logits/chosen": -1.512775182723999, "logits/rejected": -0.865697979927063, "logps/chosen": -727.4715576171875, "logps/rejected": -825.7796630859375, "loss": 0.4979, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.349008560180664, "rewards/margins": 1.724826455116272, "rewards/rejected": -6.073834419250488, "step": 12410 }, { "epoch": 0.81, "grad_norm": 27.5, "learning_rate": 5.157768635626101e-07, "logits/chosen": -1.274828553199768, "logits/rejected": -0.8104740381240845, "logps/chosen": -699.3403930664062, "logps/rejected": -829.5407104492188, "loss": 0.717, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.894136905670166, "rewards/margins": 1.3177989721298218, "rewards/rejected": -6.211935997009277, "step": 12420 }, { "epoch": 0.81, "grad_norm": 15.1875, "learning_rate": 5.12308318039966e-07, "logits/chosen": -1.5512850284576416, "logits/rejected": -1.0201778411865234, "logps/chosen": -665.0433959960938, "logps/rejected": -796.9205932617188, "loss": 0.4524, "rewards/accuracies": 0.75, "rewards/chosen": -3.633652925491333, "rewards/margins": 2.0377564430236816, "rewards/rejected": -5.671409606933594, "step": 12430 }, { "epoch": 0.81, "grad_norm": 47.5, "learning_rate": 5.088501427986181e-07, "logits/chosen": -1.3308649063110352, "logits/rejected": -0.9429399371147156, "logps/chosen": -664.4088134765625, "logps/rejected": -902.2447509765625, "loss": 0.6767, "rewards/accuracies": 0.75, "rewards/chosen": -4.446482181549072, "rewards/margins": 2.2031075954437256, "rewards/rejected": -6.649589538574219, "step": 12440 }, { "epoch": 0.81, "grad_norm": 25.25, "learning_rate": 5.054023558807241e-07, "logits/chosen": -1.457533836364746, "logits/rejected": -0.876889705657959, "logps/chosen": -751.9478759765625, "logps/rejected": -821.1819458007812, "loss": 0.5234, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.728918552398682, "rewards/margins": 1.1833834648132324, "rewards/rejected": -5.912302494049072, "step": 12450 }, { "epoch": 0.82, "grad_norm": 7.1875, "learning_rate": 5.019649752742461e-07, "logits/chosen": -1.839120626449585, "logits/rejected": -1.5246185064315796, "logps/chosen": -656.01904296875, "logps/rejected": -861.8601684570312, "loss": 0.4754, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.879534959793091, "rewards/margins": 1.8596702814102173, "rewards/rejected": -5.739205837249756, "step": 12460 }, { "epoch": 0.82, "grad_norm": 50.5, "learning_rate": 4.985380189128525e-07, "logits/chosen": -1.4253429174423218, "logits/rejected": -1.2836852073669434, "logps/chosen": -618.6815185546875, "logps/rejected": -769.9627075195312, "loss": 0.5632, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.048267364501953, "rewards/margins": 1.572575330734253, "rewards/rejected": -5.620842933654785, "step": 12470 }, { "epoch": 0.82, "grad_norm": 25.75, "learning_rate": 4.951215046758257e-07, "logits/chosen": -1.7281115055084229, "logits/rejected": -1.4013159275054932, "logps/chosen": -709.0345458984375, "logps/rejected": -914.5965576171875, "loss": 0.3907, "rewards/accuracies": 0.75, "rewards/chosen": -4.164641857147217, "rewards/margins": 2.2868521213531494, "rewards/rejected": -6.4514946937561035, "step": 12480 }, { "epoch": 0.82, "grad_norm": 9.1875, "learning_rate": 4.917154503879695e-07, "logits/chosen": -1.4673848152160645, "logits/rejected": -1.2114064693450928, "logps/chosen": -671.1936645507812, "logps/rejected": -796.7606201171875, "loss": 0.6627, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.3222336769104, "rewards/margins": 1.2201950550079346, "rewards/rejected": -5.542428970336914, "step": 12490 }, { "epoch": 0.82, "grad_norm": 3.78125, "learning_rate": 4.883198738195157e-07, "logits/chosen": -1.7344276905059814, "logits/rejected": -1.064745306968689, "logps/chosen": -643.7562255859375, "logps/rejected": -779.2528076171875, "loss": 0.4667, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.786107301712036, "rewards/margins": 1.9073196649551392, "rewards/rejected": -5.693426609039307, "step": 12500 }, { "epoch": 0.82, "eval_logits/chosen": -1.4972352981567383, "eval_logits/rejected": -1.1315851211547852, "eval_logps/chosen": -674.338134765625, "eval_logps/rejected": -835.1400146484375, "eval_loss": 0.5171301364898682, "eval_rewards/accuracies": 0.7475000023841858, "eval_rewards/chosen": -4.0971856117248535, "eval_rewards/margins": 1.8084977865219116, "eval_rewards/rejected": -5.905683994293213, "eval_runtime": 1093.6221, "eval_samples_per_second": 1.829, "eval_steps_per_second": 1.829, "step": 12500 }, { "epoch": 0.82, "grad_norm": 38.5, "learning_rate": 4.849347926860296e-07, "logits/chosen": -1.2415263652801514, "logits/rejected": -0.7254308462142944, "logps/chosen": -685.3446044921875, "logps/rejected": -867.6388549804688, "loss": 0.5719, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.3415679931640625, "rewards/margins": 1.7631118297576904, "rewards/rejected": -6.104680061340332, "step": 12510 }, { "epoch": 0.82, "grad_norm": 14.875, "learning_rate": 4.815602246483211e-07, "logits/chosen": -1.6386171579360962, "logits/rejected": -0.6473469734191895, "logps/chosen": -657.8793334960938, "logps/rejected": -858.25048828125, "loss": 0.3379, "rewards/accuracies": 0.875, "rewards/chosen": -3.655412197113037, "rewards/margins": 2.739349126815796, "rewards/rejected": -6.394761085510254, "step": 12520 }, { "epoch": 0.82, "grad_norm": 6.90625, "learning_rate": 4.781961873123506e-07, "logits/chosen": -1.5448219776153564, "logits/rejected": -1.0056270360946655, "logps/chosen": -702.23046875, "logps/rejected": -851.0791015625, "loss": 0.6019, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.241183280944824, "rewards/margins": 1.9120140075683594, "rewards/rejected": -6.153197288513184, "step": 12530 }, { "epoch": 0.82, "grad_norm": 11.875, "learning_rate": 4.748426982291354e-07, "logits/chosen": -1.628103494644165, "logits/rejected": -0.9380465745925903, "logps/chosen": -745.520263671875, "logps/rejected": -919.7072143554688, "loss": 0.4898, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.75929594039917, "rewards/margins": 1.8521884679794312, "rewards/rejected": -6.611484527587891, "step": 12540 }, { "epoch": 0.82, "grad_norm": 6.25, "learning_rate": 4.714997748946615e-07, "logits/chosen": -1.689292311668396, "logits/rejected": -0.7681092023849487, "logps/chosen": -673.8282470703125, "logps/rejected": -850.1197509765625, "loss": 0.4289, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.053454399108887, "rewards/margins": 2.1432976722717285, "rewards/rejected": -6.196751594543457, "step": 12550 }, { "epoch": 0.82, "grad_norm": 27.75, "learning_rate": 4.681674347497914e-07, "logits/chosen": -1.6710712909698486, "logits/rejected": -1.4516799449920654, "logps/chosen": -740.8311767578125, "logps/rejected": -924.17431640625, "loss": 0.4237, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.395913124084473, "rewards/margins": 2.2063040733337402, "rewards/rejected": -6.602217197418213, "step": 12560 }, { "epoch": 0.82, "grad_norm": 9.1875, "learning_rate": 4.648456951801697e-07, "logits/chosen": -1.323366403579712, "logits/rejected": -0.9112518429756165, "logps/chosen": -594.5243530273438, "logps/rejected": -701.0707397460938, "loss": 0.6359, "rewards/accuracies": 0.75, "rewards/chosen": -3.864246368408203, "rewards/margins": 1.4721548557281494, "rewards/rejected": -5.336400985717773, "step": 12570 }, { "epoch": 0.82, "grad_norm": 9.125, "learning_rate": 4.615345735161375e-07, "logits/chosen": -1.51564621925354, "logits/rejected": -1.0455206632614136, "logps/chosen": -650.6685791015625, "logps/rejected": -870.9066162109375, "loss": 0.4468, "rewards/accuracies": 0.75, "rewards/chosen": -3.68034029006958, "rewards/margins": 2.409252643585205, "rewards/rejected": -6.089592933654785, "step": 12580 }, { "epoch": 0.82, "grad_norm": 19.625, "learning_rate": 4.5823408703264073e-07, "logits/chosen": -1.290935754776001, "logits/rejected": -1.3564382791519165, "logps/chosen": -630.6192016601562, "logps/rejected": -842.8318481445312, "loss": 0.5217, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.9717793464660645, "rewards/margins": 1.8552792072296143, "rewards/rejected": -5.8270583152771, "step": 12590 }, { "epoch": 0.82, "grad_norm": 9.875, "learning_rate": 4.549442529491352e-07, "logits/chosen": -1.9019479751586914, "logits/rejected": -1.3907419443130493, "logps/chosen": -728.08349609375, "logps/rejected": -883.9122924804688, "loss": 0.5658, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.508058547973633, "rewards/margins": 1.787693738937378, "rewards/rejected": -5.29575252532959, "step": 12600 }, { "epoch": 0.82, "eval_logits/chosen": -1.4965088367462158, "eval_logits/rejected": -1.1300994157791138, "eval_logps/chosen": -675.2821655273438, "eval_logps/rejected": -836.3403930664062, "eval_loss": 0.5172280669212341, "eval_rewards/accuracies": 0.746999979019165, "eval_rewards/chosen": -4.106625556945801, "eval_rewards/margins": 1.8110620975494385, "eval_rewards/rejected": -5.91768741607666, "eval_runtime": 1082.8834, "eval_samples_per_second": 1.847, "eval_steps_per_second": 1.847, "step": 12600 }, { "epoch": 0.83, "grad_norm": 27.125, "learning_rate": 4.5166508842950345e-07, "logits/chosen": -1.6887989044189453, "logits/rejected": -1.703626036643982, "logps/chosen": -664.2781372070312, "logps/rejected": -681.11328125, "loss": 0.8148, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -3.8737645149230957, "rewards/margins": 0.5770565271377563, "rewards/rejected": -4.4508209228515625, "step": 12610 }, { "epoch": 0.83, "grad_norm": 12.6875, "learning_rate": 4.4839661058196204e-07, "logits/chosen": -1.3557672500610352, "logits/rejected": -1.199262261390686, "logps/chosen": -665.2078247070312, "logps/rejected": -832.7658081054688, "loss": 0.3799, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.008880138397217, "rewards/margins": 1.9919188022613525, "rewards/rejected": -6.00079870223999, "step": 12620 }, { "epoch": 0.83, "grad_norm": 7.21875, "learning_rate": 4.4513883645897067e-07, "logits/chosen": -1.3051261901855469, "logits/rejected": -1.305677890777588, "logps/chosen": -662.8770751953125, "logps/rejected": -824.8167724609375, "loss": 0.5952, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.012806415557861, "rewards/margins": 1.647301435470581, "rewards/rejected": -5.6601080894470215, "step": 12630 }, { "epoch": 0.83, "grad_norm": 82.5, "learning_rate": 4.418917830571468e-07, "logits/chosen": -1.5561306476593018, "logits/rejected": -1.4859167337417603, "logps/chosen": -598.1131591796875, "logps/rejected": -799.564453125, "loss": 0.4834, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.7001729011535645, "rewards/margins": 1.8898242712020874, "rewards/rejected": -5.589997291564941, "step": 12640 }, { "epoch": 0.83, "grad_norm": 7.8125, "learning_rate": 4.38655467317175e-07, "logits/chosen": -1.31911039352417, "logits/rejected": -1.0993320941925049, "logps/chosen": -719.1732788085938, "logps/rejected": -890.9327392578125, "loss": 0.4272, "rewards/accuracies": 0.75, "rewards/chosen": -4.33695125579834, "rewards/margins": 2.1743664741516113, "rewards/rejected": -6.511318206787109, "step": 12650 }, { "epoch": 0.83, "grad_norm": 25.375, "learning_rate": 4.354299061237177e-07, "logits/chosen": -1.1850215196609497, "logits/rejected": -0.6495458483695984, "logps/chosen": -670.984619140625, "logps/rejected": -847.2431640625, "loss": 0.3623, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.045533180236816, "rewards/margins": 2.0306758880615234, "rewards/rejected": -6.076208591461182, "step": 12660 }, { "epoch": 0.83, "grad_norm": 31.625, "learning_rate": 4.3221511630532955e-07, "logits/chosen": -1.7848049402236938, "logits/rejected": -1.2645317316055298, "logps/chosen": -681.5743408203125, "logps/rejected": -754.2684936523438, "loss": 0.4328, "rewards/accuracies": 0.75, "rewards/chosen": -3.9557137489318848, "rewards/margins": 1.3707351684570312, "rewards/rejected": -5.326448917388916, "step": 12670 }, { "epoch": 0.83, "grad_norm": 18.125, "learning_rate": 4.290111146343673e-07, "logits/chosen": -1.4899415969848633, "logits/rejected": -1.0903269052505493, "logps/chosen": -636.47705078125, "logps/rejected": -777.0272216796875, "loss": 0.5606, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.226902484893799, "rewards/margins": 1.4171861410140991, "rewards/rejected": -5.644087791442871, "step": 12680 }, { "epoch": 0.83, "grad_norm": 23.25, "learning_rate": 4.258179178269037e-07, "logits/chosen": -1.687951683998108, "logits/rejected": -0.9723125696182251, "logps/chosen": -660.561279296875, "logps/rejected": -818.4022216796875, "loss": 0.5622, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.900709867477417, "rewards/margins": 1.7832008600234985, "rewards/rejected": -5.683910846710205, "step": 12690 }, { "epoch": 0.83, "grad_norm": 43.5, "learning_rate": 4.226355425426398e-07, "logits/chosen": -1.785590410232544, "logits/rejected": -0.9777039289474487, "logps/chosen": -680.770263671875, "logps/rejected": -817.2718505859375, "loss": 0.6554, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.0570268630981445, "rewards/margins": 1.418658971786499, "rewards/rejected": -5.475686550140381, "step": 12700 }, { "epoch": 0.83, "eval_logits/chosen": -1.4943361282348633, "eval_logits/rejected": -1.1282639503479004, "eval_logps/chosen": -675.9285888671875, "eval_logps/rejected": -836.6074829101562, "eval_loss": 0.5166574716567993, "eval_rewards/accuracies": 0.7490000128746033, "eval_rewards/chosen": -4.1130900382995605, "eval_rewards/margins": 1.8072683811187744, "eval_rewards/rejected": -5.920358180999756, "eval_runtime": 1082.6655, "eval_samples_per_second": 1.847, "eval_steps_per_second": 1.847, "step": 12700 }, { "epoch": 0.83, "grad_norm": 52.75, "learning_rate": 4.19464005384819e-07, "logits/chosen": -1.921164870262146, "logits/rejected": -1.2839831113815308, "logps/chosen": -780.373779296875, "logps/rejected": -919.9296875, "loss": 0.5622, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.273499488830566, "rewards/margins": 1.4658682346343994, "rewards/rejected": -5.739367485046387, "step": 12710 }, { "epoch": 0.83, "grad_norm": 111.5, "learning_rate": 4.16303322900137e-07, "logits/chosen": -1.4493604898452759, "logits/rejected": -0.8644909858703613, "logps/chosen": -749.5689697265625, "logps/rejected": -800.6668701171875, "loss": 1.0842, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.981212615966797, "rewards/margins": 0.9572421908378601, "rewards/rejected": -5.938455104827881, "step": 12720 }, { "epoch": 0.83, "grad_norm": 29.25, "learning_rate": 4.1315351157866003e-07, "logits/chosen": -1.2041301727294922, "logits/rejected": -0.9239950180053711, "logps/chosen": -657.1053466796875, "logps/rejected": -752.8463134765625, "loss": 0.6262, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.948913097381592, "rewards/margins": 1.5130994319915771, "rewards/rejected": -5.462012767791748, "step": 12730 }, { "epoch": 0.83, "grad_norm": 39.25, "learning_rate": 4.10014587853737e-07, "logits/chosen": -1.0515629053115845, "logits/rejected": -1.2130296230316162, "logps/chosen": -598.2210693359375, "logps/rejected": -765.2221069335938, "loss": 0.5382, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.836686372756958, "rewards/margins": 1.6214301586151123, "rewards/rejected": -5.45811653137207, "step": 12740 }, { "epoch": 0.83, "grad_norm": 19.875, "learning_rate": 4.0688656810191157e-07, "logits/chosen": -1.6691042184829712, "logits/rejected": -1.4067200422286987, "logps/chosen": -712.8956298828125, "logps/rejected": -905.08935546875, "loss": 0.5793, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.043822765350342, "rewards/margins": 2.250225782394409, "rewards/rejected": -6.294048309326172, "step": 12750 }, { "epoch": 0.83, "grad_norm": 12.5625, "learning_rate": 4.037694686428406e-07, "logits/chosen": -1.389685034751892, "logits/rejected": -1.03188955783844, "logps/chosen": -738.7546997070312, "logps/rejected": -923.1394653320312, "loss": 0.4241, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.407435894012451, "rewards/margins": 1.9963579177856445, "rewards/rejected": -6.403794288635254, "step": 12760 }, { "epoch": 0.84, "grad_norm": 34.0, "learning_rate": 4.0066330573920593e-07, "logits/chosen": -1.4484285116195679, "logits/rejected": -1.2367875576019287, "logps/chosen": -654.9096069335938, "logps/rejected": -768.2120361328125, "loss": 0.5258, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.9301371574401855, "rewards/margins": 1.4494191408157349, "rewards/rejected": -5.379557132720947, "step": 12770 }, { "epoch": 0.84, "grad_norm": 19.125, "learning_rate": 3.9756809559663185e-07, "logits/chosen": -1.24978768825531, "logits/rejected": -1.17453932762146, "logps/chosen": -619.5692138671875, "logps/rejected": -833.2843627929688, "loss": 0.4356, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.02880859375, "rewards/margins": 2.0331053733825684, "rewards/rejected": -6.061914443969727, "step": 12780 }, { "epoch": 0.84, "grad_norm": 28.625, "learning_rate": 3.944838543635976e-07, "logits/chosen": -1.3630785942077637, "logits/rejected": -1.692875862121582, "logps/chosen": -639.0132446289062, "logps/rejected": -823.0880737304688, "loss": 0.6255, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.875023365020752, "rewards/margins": 1.8052186965942383, "rewards/rejected": -5.680241584777832, "step": 12790 }, { "epoch": 0.84, "grad_norm": 19.0, "learning_rate": 3.9141059813135653e-07, "logits/chosen": -1.582597255706787, "logits/rejected": -1.4261318445205688, "logps/chosen": -632.0634155273438, "logps/rejected": -809.0720825195312, "loss": 0.5481, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.8898863792419434, "rewards/margins": 1.921811819076538, "rewards/rejected": -5.811697483062744, "step": 12800 }, { "epoch": 0.84, "eval_logits/chosen": -1.5030485391616821, "eval_logits/rejected": -1.1393755674362183, "eval_logps/chosen": -672.578857421875, "eval_logps/rejected": -831.3081665039062, "eval_loss": 0.5153519511222839, "eval_rewards/accuracies": 0.7490000128746033, "eval_rewards/chosen": -4.079592704772949, "eval_rewards/margins": 1.7877726554870605, "eval_rewards/rejected": -5.86736536026001, "eval_runtime": 1086.5598, "eval_samples_per_second": 1.841, "eval_steps_per_second": 1.841, "step": 12800 }, { "epoch": 0.84, "grad_norm": 15.9375, "learning_rate": 3.8834834293384946e-07, "logits/chosen": -1.3257964849472046, "logits/rejected": -1.122748613357544, "logps/chosen": -662.0810546875, "logps/rejected": -846.8050537109375, "loss": 0.4149, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.8199825286865234, "rewards/margins": 2.079073429107666, "rewards/rejected": -5.899056434631348, "step": 12810 }, { "epoch": 0.84, "grad_norm": 34.0, "learning_rate": 3.8529710474762253e-07, "logits/chosen": -1.4549511671066284, "logits/rejected": -1.3545879125595093, "logps/chosen": -584.5658569335938, "logps/rejected": -801.1481323242188, "loss": 0.4429, "rewards/accuracies": 0.875, "rewards/chosen": -3.4955451488494873, "rewards/margins": 2.0276026725769043, "rewards/rejected": -5.5231475830078125, "step": 12820 }, { "epoch": 0.84, "grad_norm": 25.5, "learning_rate": 3.822568994917439e-07, "logits/chosen": -1.6016972064971924, "logits/rejected": -1.4214098453521729, "logps/chosen": -713.5632934570312, "logps/rejected": -852.3037109375, "loss": 0.5147, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.043216705322266, "rewards/margins": 1.6596847772598267, "rewards/rejected": -5.7029008865356445, "step": 12830 }, { "epoch": 0.84, "grad_norm": 21.5, "learning_rate": 3.792277430277197e-07, "logits/chosen": -1.1923797130584717, "logits/rejected": -0.9504438638687134, "logps/chosen": -656.5103759765625, "logps/rejected": -792.1350708007812, "loss": 0.6209, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.149816513061523, "rewards/margins": 1.574987530708313, "rewards/rejected": -5.724803924560547, "step": 12840 }, { "epoch": 0.84, "grad_norm": 17.625, "learning_rate": 3.76209651159411e-07, "logits/chosen": -1.7556816339492798, "logits/rejected": -1.1585263013839722, "logps/chosen": -696.5750732421875, "logps/rejected": -807.4962158203125, "loss": 0.5521, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.9256744384765625, "rewards/margins": 1.4886971712112427, "rewards/rejected": -5.414371490478516, "step": 12850 }, { "epoch": 0.84, "grad_norm": 22.875, "learning_rate": 3.73202639632953e-07, "logits/chosen": -1.0938459634780884, "logits/rejected": -0.9320684671401978, "logps/chosen": -703.521240234375, "logps/rejected": -845.0142822265625, "loss": 0.5379, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.344259738922119, "rewards/margins": 1.6624557971954346, "rewards/rejected": -6.006714820861816, "step": 12860 }, { "epoch": 0.84, "grad_norm": 22.125, "learning_rate": 3.702067241366725e-07, "logits/chosen": -1.7636568546295166, "logits/rejected": -0.9995309114456177, "logps/chosen": -672.2806396484375, "logps/rejected": -765.2754516601562, "loss": 0.4934, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.792029619216919, "rewards/margins": 1.4832711219787598, "rewards/rejected": -5.275300979614258, "step": 12870 }, { "epoch": 0.84, "grad_norm": 37.5, "learning_rate": 3.6722192030100386e-07, "logits/chosen": -1.3920162916183472, "logits/rejected": -1.0627715587615967, "logps/chosen": -664.1403198242188, "logps/rejected": -844.4947509765625, "loss": 0.3356, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.6983249187469482, "rewards/margins": 2.238010883331299, "rewards/rejected": -5.936335563659668, "step": 12880 }, { "epoch": 0.84, "grad_norm": 17.25, "learning_rate": 3.642482436984107e-07, "logits/chosen": -1.6202392578125, "logits/rejected": -1.7871360778808594, "logps/chosen": -669.7159423828125, "logps/rejected": -816.4592895507812, "loss": 0.5371, "rewards/accuracies": 0.75, "rewards/chosen": -3.80824613571167, "rewards/margins": 1.6600383520126343, "rewards/rejected": -5.468285083770752, "step": 12890 }, { "epoch": 0.84, "grad_norm": 6.15625, "learning_rate": 3.6128570984330343e-07, "logits/chosen": -1.6146312952041626, "logits/rejected": -1.0325052738189697, "logps/chosen": -640.1663818359375, "logps/rejected": -913.9518432617188, "loss": 0.3902, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.9911742210388184, "rewards/margins": 2.67854380607605, "rewards/rejected": -6.669718265533447, "step": 12900 }, { "epoch": 0.84, "eval_logits/chosen": -1.5025445222854614, "eval_logits/rejected": -1.1385281085968018, "eval_logps/chosen": -672.0549926757812, "eval_logps/rejected": -831.2067260742188, "eval_loss": 0.5154820680618286, "eval_rewards/accuracies": 0.7484999895095825, "eval_rewards/chosen": -4.0743536949157715, "eval_rewards/margins": 1.7919962406158447, "eval_rewards/rejected": -5.866350173950195, "eval_runtime": 1082.9527, "eval_samples_per_second": 1.847, "eval_steps_per_second": 1.847, "step": 12900 }, { "epoch": 0.84, "grad_norm": 9.75, "learning_rate": 3.5833433419195514e-07, "logits/chosen": -1.424258828163147, "logits/rejected": -0.26776123046875, "logps/chosen": -742.7805786132812, "logps/rejected": -784.5831298828125, "loss": 0.6708, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.680123329162598, "rewards/margins": 1.244545578956604, "rewards/rejected": -5.92466926574707, "step": 12910 }, { "epoch": 0.85, "grad_norm": 43.25, "learning_rate": 3.5539413214242755e-07, "logits/chosen": -1.2278188467025757, "logits/rejected": -1.324303388595581, "logps/chosen": -715.6702270507812, "logps/rejected": -848.7413330078125, "loss": 0.5673, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.276078224182129, "rewards/margins": 1.6185306310653687, "rewards/rejected": -5.894608974456787, "step": 12920 }, { "epoch": 0.85, "grad_norm": 30.75, "learning_rate": 3.524651190344852e-07, "logits/chosen": -1.3894983530044556, "logits/rejected": -1.1137068271636963, "logps/chosen": -679.8286743164062, "logps/rejected": -901.2341918945312, "loss": 0.6069, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.32813835144043, "rewards/margins": 1.9111255407333374, "rewards/rejected": -6.239263534545898, "step": 12930 }, { "epoch": 0.85, "grad_norm": 12.25, "learning_rate": 3.4954731014951586e-07, "logits/chosen": -1.3717103004455566, "logits/rejected": -0.986125111579895, "logps/chosen": -638.8120727539062, "logps/rejected": -867.0013427734375, "loss": 0.3663, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.7314419746398926, "rewards/margins": 2.1722311973571777, "rewards/rejected": -5.90367317199707, "step": 12940 }, { "epoch": 0.85, "grad_norm": 16.625, "learning_rate": 3.4664072071045413e-07, "logits/chosen": -1.6115827560424805, "logits/rejected": -1.0516890287399292, "logps/chosen": -612.7985229492188, "logps/rejected": -794.4291381835938, "loss": 0.3862, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.495833158493042, "rewards/margins": 2.168839693069458, "rewards/rejected": -5.664672374725342, "step": 12950 }, { "epoch": 0.85, "grad_norm": 17.25, "learning_rate": 3.437453658816994e-07, "logits/chosen": -1.4947166442871094, "logits/rejected": -0.9751759767532349, "logps/chosen": -605.2887573242188, "logps/rejected": -822.3523559570312, "loss": 0.4493, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.8558642864227295, "rewards/margins": 2.2588956356048584, "rewards/rejected": -6.114760398864746, "step": 12960 }, { "epoch": 0.85, "grad_norm": 37.0, "learning_rate": 3.408612607690365e-07, "logits/chosen": -1.223359227180481, "logits/rejected": -0.9667221903800964, "logps/chosen": -758.9775390625, "logps/rejected": -900.5115356445312, "loss": 0.8307, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -5.05471134185791, "rewards/margins": 1.1281535625457764, "rewards/rejected": -6.182865142822266, "step": 12970 }, { "epoch": 0.85, "grad_norm": 32.5, "learning_rate": 3.3798842041955825e-07, "logits/chosen": -1.666280746459961, "logits/rejected": -1.418423056602478, "logps/chosen": -683.7225341796875, "logps/rejected": -798.1446533203125, "loss": 0.5075, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.029163837432861, "rewards/margins": 1.7276551723480225, "rewards/rejected": -5.756818771362305, "step": 12980 }, { "epoch": 0.85, "grad_norm": 22.25, "learning_rate": 3.3512685982158733e-07, "logits/chosen": -1.3196337223052979, "logits/rejected": -1.3326494693756104, "logps/chosen": -687.1990966796875, "logps/rejected": -777.4107055664062, "loss": 0.586, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.1333465576171875, "rewards/margins": 1.195655107498169, "rewards/rejected": -5.329002380371094, "step": 12990 }, { "epoch": 0.85, "grad_norm": 14.75, "learning_rate": 3.322765939045952e-07, "logits/chosen": -1.708003044128418, "logits/rejected": -1.042566180229187, "logps/chosen": -701.4783935546875, "logps/rejected": -826.1650390625, "loss": 0.3801, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.001893043518066, "rewards/margins": 2.104011058807373, "rewards/rejected": -6.105904579162598, "step": 13000 }, { "epoch": 0.85, "eval_logits/chosen": -1.5056382417678833, "eval_logits/rejected": -1.1421719789505005, "eval_logps/chosen": -670.4493408203125, "eval_logps/rejected": -829.2069091796875, "eval_loss": 0.5154767632484436, "eval_rewards/accuracies": 0.7480000257492065, "eval_rewards/chosen": -4.058297157287598, "eval_rewards/margins": 1.7880553007125854, "eval_rewards/rejected": -5.846351623535156, "eval_runtime": 1082.6663, "eval_samples_per_second": 1.847, "eval_steps_per_second": 1.847, "step": 13000 }, { "epoch": 0.85, "grad_norm": 19.5, "learning_rate": 3.294376375391278e-07, "logits/chosen": -1.5306568145751953, "logits/rejected": -0.9261484146118164, "logps/chosen": -649.072509765625, "logps/rejected": -827.5198364257812, "loss": 0.4678, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.043013572692871, "rewards/margins": 2.2800557613372803, "rewards/rejected": -6.3230695724487305, "step": 13010 }, { "epoch": 0.85, "grad_norm": 47.5, "learning_rate": 3.266100055367255e-07, "logits/chosen": -1.4810465574264526, "logits/rejected": -1.265726089477539, "logps/chosen": -620.5711669921875, "logps/rejected": -815.60791015625, "loss": 0.4909, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.157571792602539, "rewards/margins": 1.8569015264511108, "rewards/rejected": -6.014473915100098, "step": 13020 }, { "epoch": 0.85, "grad_norm": 5.96875, "learning_rate": 3.2379371264984694e-07, "logits/chosen": -1.7722549438476562, "logits/rejected": -1.1962707042694092, "logps/chosen": -602.5084228515625, "logps/rejected": -815.4844970703125, "loss": 0.4614, "rewards/accuracies": 0.75, "rewards/chosen": -3.630559206008911, "rewards/margins": 2.287370443344116, "rewards/rejected": -5.917929649353027, "step": 13030 }, { "epoch": 0.85, "grad_norm": 23.625, "learning_rate": 3.209887735717918e-07, "logits/chosen": -1.7228397130966187, "logits/rejected": -1.0715776681900024, "logps/chosen": -637.7086181640625, "logps/rejected": -760.956298828125, "loss": 0.5703, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.7957091331481934, "rewards/margins": 1.3664389848709106, "rewards/rejected": -5.162148475646973, "step": 13040 }, { "epoch": 0.85, "grad_norm": 19.875, "learning_rate": 3.181952029366248e-07, "logits/chosen": -1.178336501121521, "logits/rejected": -1.3111517429351807, "logps/chosen": -642.4066162109375, "logps/rejected": -818.6258544921875, "loss": 0.6081, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.1842145919799805, "rewards/margins": 1.145550012588501, "rewards/rejected": -5.329764366149902, "step": 13050 }, { "epoch": 0.85, "grad_norm": 5.5, "learning_rate": 3.154130153190968e-07, "logits/chosen": -1.066184639930725, "logits/rejected": -0.9536517858505249, "logps/chosen": -619.2181396484375, "logps/rejected": -892.4988403320312, "loss": 0.3429, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.7910423278808594, "rewards/margins": 2.6539740562438965, "rewards/rejected": -6.445016384124756, "step": 13060 }, { "epoch": 0.86, "grad_norm": 8.9375, "learning_rate": 3.1264222523457206e-07, "logits/chosen": -1.742893934249878, "logits/rejected": -1.4096717834472656, "logps/chosen": -727.0240478515625, "logps/rejected": -870.3107299804688, "loss": 0.5201, "rewards/accuracies": 0.75, "rewards/chosen": -4.318058967590332, "rewards/margins": 1.778149962425232, "rewards/rejected": -6.0962090492248535, "step": 13070 }, { "epoch": 0.86, "grad_norm": 25.5, "learning_rate": 3.098828471389509e-07, "logits/chosen": -1.3382033109664917, "logits/rejected": -1.2017595767974854, "logps/chosen": -662.73876953125, "logps/rejected": -868.0599365234375, "loss": 0.5213, "rewards/accuracies": 0.75, "rewards/chosen": -4.07410192489624, "rewards/margins": 2.0450592041015625, "rewards/rejected": -6.1191606521606445, "step": 13080 }, { "epoch": 0.86, "grad_norm": 7.9375, "learning_rate": 3.071348954285949e-07, "logits/chosen": -1.535391092300415, "logits/rejected": -1.2895714044570923, "logps/chosen": -614.9608154296875, "logps/rejected": -803.8629150390625, "loss": 0.574, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.740224838256836, "rewards/margins": 1.8981698751449585, "rewards/rejected": -5.638394355773926, "step": 13090 }, { "epoch": 0.86, "grad_norm": 17.75, "learning_rate": 3.0439838444024983e-07, "logits/chosen": -1.4118118286132812, "logits/rejected": -1.5126476287841797, "logps/chosen": -590.5972290039062, "logps/rejected": -750.5953369140625, "loss": 0.6991, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.89982533454895, "rewards/margins": 1.1269371509552002, "rewards/rejected": -5.026762008666992, "step": 13100 }, { "epoch": 0.86, "eval_logits/chosen": -1.5068999528884888, "eval_logits/rejected": -1.1434739828109741, "eval_logps/chosen": -669.7777709960938, "eval_logps/rejected": -828.6917114257812, "eval_loss": 0.5154184699058533, "eval_rewards/accuracies": 0.7494999766349792, "eval_rewards/chosen": -4.051581859588623, "eval_rewards/margins": 1.7896184921264648, "eval_rewards/rejected": -5.84119987487793, "eval_runtime": 1082.3184, "eval_samples_per_second": 1.848, "eval_steps_per_second": 1.848, "step": 13100 }, { "epoch": 0.86, "grad_norm": 31.875, "learning_rate": 3.0167332845097376e-07, "logits/chosen": -1.2860263586044312, "logits/rejected": -1.2891541719436646, "logps/chosen": -692.1019287109375, "logps/rejected": -832.9024658203125, "loss": 0.5541, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.078530788421631, "rewards/margins": 1.525813102722168, "rewards/rejected": -5.604344367980957, "step": 13110 }, { "epoch": 0.86, "grad_norm": 15.125, "learning_rate": 2.98959741678061e-07, "logits/chosen": -1.6061878204345703, "logits/rejected": -1.1270813941955566, "logps/chosen": -715.585693359375, "logps/rejected": -826.5067138671875, "loss": 0.5325, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.194059371948242, "rewards/margins": 1.922698974609375, "rewards/rejected": -6.116758823394775, "step": 13120 }, { "epoch": 0.86, "grad_norm": 25.375, "learning_rate": 2.9625763827896695e-07, "logits/chosen": -1.4923087358474731, "logits/rejected": -0.768844485282898, "logps/chosen": -639.72802734375, "logps/rejected": -844.6573486328125, "loss": 0.3333, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.8699536323547363, "rewards/margins": 2.415266275405884, "rewards/rejected": -6.285220146179199, "step": 13130 }, { "epoch": 0.86, "grad_norm": 20.875, "learning_rate": 2.935670323512377e-07, "logits/chosen": -1.4547690153121948, "logits/rejected": -1.2718137502670288, "logps/chosen": -641.06298828125, "logps/rejected": -852.6295776367188, "loss": 0.5301, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.8585429191589355, "rewards/margins": 2.3552443981170654, "rewards/rejected": -6.21378755569458, "step": 13140 }, { "epoch": 0.86, "grad_norm": 38.5, "learning_rate": 2.908879379324317e-07, "logits/chosen": -1.4247170686721802, "logits/rejected": -1.241533875465393, "logps/chosen": -665.2776489257812, "logps/rejected": -846.8787231445312, "loss": 0.47, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.8799641132354736, "rewards/margins": 1.964072823524475, "rewards/rejected": -5.844037055969238, "step": 13150 }, { "epoch": 0.86, "grad_norm": 64.0, "learning_rate": 2.882203690000507e-07, "logits/chosen": -1.3361215591430664, "logits/rejected": -0.9772714376449585, "logps/chosen": -635.6947021484375, "logps/rejected": -889.3577270507812, "loss": 0.2796, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.7453525066375732, "rewards/margins": 2.5614285469055176, "rewards/rejected": -6.306780815124512, "step": 13160 }, { "epoch": 0.86, "grad_norm": 34.0, "learning_rate": 2.855643394714638e-07, "logits/chosen": -1.2784361839294434, "logits/rejected": -1.1720725297927856, "logps/chosen": -727.8234252929688, "logps/rejected": -796.2040405273438, "loss": 0.8318, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -4.507246971130371, "rewards/margins": 0.5959983468055725, "rewards/rejected": -5.103245258331299, "step": 13170 }, { "epoch": 0.86, "grad_norm": 16.375, "learning_rate": 2.8291986320383824e-07, "logits/chosen": -1.4929214715957642, "logits/rejected": -1.1983537673950195, "logps/chosen": -618.2957763671875, "logps/rejected": -804.4884033203125, "loss": 0.4107, "rewards/accuracies": 0.75, "rewards/chosen": -4.022196292877197, "rewards/margins": 1.793534517288208, "rewards/rejected": -5.815731048583984, "step": 13180 }, { "epoch": 0.86, "grad_norm": 11.3125, "learning_rate": 2.80286953994062e-07, "logits/chosen": -1.6695266962051392, "logits/rejected": -1.0117148160934448, "logps/chosen": -610.8551025390625, "logps/rejected": -719.1248168945312, "loss": 0.5354, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.7052032947540283, "rewards/margins": 1.6667282581329346, "rewards/rejected": -5.371931552886963, "step": 13190 }, { "epoch": 0.86, "grad_norm": 48.5, "learning_rate": 2.776656255786775e-07, "logits/chosen": -1.4783408641815186, "logits/rejected": -1.2065401077270508, "logps/chosen": -607.0650634765625, "logps/rejected": -782.0667114257812, "loss": 0.472, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.699052333831787, "rewards/margins": 2.0071704387664795, "rewards/rejected": -5.706223487854004, "step": 13200 }, { "epoch": 0.86, "eval_logits/chosen": -1.5045762062072754, "eval_logits/rejected": -1.1407368183135986, "eval_logps/chosen": -669.9542846679688, "eval_logps/rejected": -829.1138305664062, "eval_loss": 0.5151001811027527, "eval_rewards/accuracies": 0.7484999895095825, "eval_rewards/chosen": -4.053346157073975, "eval_rewards/margins": 1.7920758724212646, "eval_rewards/rejected": -5.845422267913818, "eval_runtime": 1082.645, "eval_samples_per_second": 1.847, "eval_steps_per_second": 1.847, "step": 13200 }, { "epoch": 0.86, "grad_norm": 37.75, "learning_rate": 2.7505589163380643e-07, "logits/chosen": -1.5894315242767334, "logits/rejected": -1.4719792604446411, "logps/chosen": -740.5426025390625, "logps/rejected": -854.9299926757812, "loss": 0.5457, "rewards/accuracies": 0.75, "rewards/chosen": -4.116081714630127, "rewards/margins": 1.2735066413879395, "rewards/rejected": -5.389588356018066, "step": 13210 }, { "epoch": 0.86, "grad_norm": 17.5, "learning_rate": 2.724577657750782e-07, "logits/chosen": -1.3791464567184448, "logits/rejected": -1.1992027759552002, "logps/chosen": -604.3084716796875, "logps/rejected": -818.7472534179688, "loss": 0.5431, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.9501006603240967, "rewards/margins": 1.7296159267425537, "rewards/rejected": -5.67971658706665, "step": 13220 }, { "epoch": 0.87, "grad_norm": 21.125, "learning_rate": 2.6987126155756133e-07, "logits/chosen": -1.7427043914794922, "logits/rejected": -1.1716797351837158, "logps/chosen": -657.875732421875, "logps/rejected": -752.6412353515625, "loss": 0.421, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.816807270050049, "rewards/margins": 1.6463342905044556, "rewards/rejected": -5.463141441345215, "step": 13230 }, { "epoch": 0.87, "grad_norm": 25.75, "learning_rate": 2.672963924756908e-07, "logits/chosen": -1.5970592498779297, "logits/rejected": -1.3868812322616577, "logps/chosen": -663.7234497070312, "logps/rejected": -887.748046875, "loss": 0.5476, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.09576416015625, "rewards/margins": 2.107668876647949, "rewards/rejected": -6.203433036804199, "step": 13240 }, { "epoch": 0.87, "grad_norm": 32.0, "learning_rate": 2.6473317196319846e-07, "logits/chosen": -1.4853492975234985, "logits/rejected": -1.4623899459838867, "logps/chosen": -660.8912353515625, "logps/rejected": -798.3517456054688, "loss": 0.567, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.109341621398926, "rewards/margins": 1.6717054843902588, "rewards/rejected": -5.7810468673706055, "step": 13250 }, { "epoch": 0.87, "grad_norm": 21.75, "learning_rate": 2.6218161339304273e-07, "logits/chosen": -1.7166149616241455, "logits/rejected": -1.2827997207641602, "logps/chosen": -606.2095947265625, "logps/rejected": -845.9509887695312, "loss": 0.3132, "rewards/accuracies": 0.875, "rewards/chosen": -3.506187915802002, "rewards/margins": 2.379061698913574, "rewards/rejected": -5.885249614715576, "step": 13260 }, { "epoch": 0.87, "grad_norm": 22.625, "learning_rate": 2.596417300773385e-07, "logits/chosen": -1.3408167362213135, "logits/rejected": -1.0707741975784302, "logps/chosen": -649.1927490234375, "logps/rejected": -805.7384643554688, "loss": 0.5711, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.824604034423828, "rewards/margins": 1.758058786392212, "rewards/rejected": -5.582662105560303, "step": 13270 }, { "epoch": 0.87, "grad_norm": 6.28125, "learning_rate": 2.571135352672871e-07, "logits/chosen": -1.5792495012283325, "logits/rejected": -1.2226645946502686, "logps/chosen": -731.9813232421875, "logps/rejected": -863.3533935546875, "loss": 0.389, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.068902015686035, "rewards/margins": 1.9599024057388306, "rewards/rejected": -6.028803825378418, "step": 13280 }, { "epoch": 0.87, "grad_norm": 46.25, "learning_rate": 2.5459704215310966e-07, "logits/chosen": -1.24709951877594, "logits/rejected": -1.3801298141479492, "logps/chosen": -609.868408203125, "logps/rejected": -817.7156982421875, "loss": 0.6905, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.646284818649292, "rewards/margins": 2.018608570098877, "rewards/rejected": -5.66489315032959, "step": 13290 }, { "epoch": 0.87, "grad_norm": 19.125, "learning_rate": 2.5209226386397613e-07, "logits/chosen": -1.5874335765838623, "logits/rejected": -1.2782566547393799, "logps/chosen": -711.1026611328125, "logps/rejected": -850.4737548828125, "loss": 0.3055, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.5862908363342285, "rewards/margins": 2.1739068031311035, "rewards/rejected": -5.760197639465332, "step": 13300 }, { "epoch": 0.87, "eval_logits/chosen": -1.5057016611099243, "eval_logits/rejected": -1.1420787572860718, "eval_logps/chosen": -668.9513549804688, "eval_logps/rejected": -828.0081176757812, "eval_loss": 0.5150899887084961, "eval_rewards/accuracies": 0.7494999766349792, "eval_rewards/chosen": -4.043317794799805, "eval_rewards/margins": 1.7910466194152832, "eval_rewards/rejected": -5.834364414215088, "eval_runtime": 1082.767, "eval_samples_per_second": 1.847, "eval_steps_per_second": 1.847, "step": 13300 }, { "epoch": 0.87, "grad_norm": 25.25, "learning_rate": 2.495992134679365e-07, "logits/chosen": -1.4658201932907104, "logits/rejected": -1.1204164028167725, "logps/chosen": -666.6136474609375, "logps/rejected": -792.7175903320312, "loss": 0.4368, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.221998691558838, "rewards/margins": 1.6347713470458984, "rewards/rejected": -5.856769561767578, "step": 13310 }, { "epoch": 0.87, "grad_norm": 34.75, "learning_rate": 2.4711790397185407e-07, "logits/chosen": -1.320035457611084, "logits/rejected": -1.3589204549789429, "logps/chosen": -663.8743896484375, "logps/rejected": -865.33154296875, "loss": 0.6001, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -3.977898120880127, "rewards/margins": 1.9222595691680908, "rewards/rejected": -5.900157451629639, "step": 13320 }, { "epoch": 0.87, "grad_norm": 53.25, "learning_rate": 2.4464834832133714e-07, "logits/chosen": -1.5679186582565308, "logits/rejected": -1.083113431930542, "logps/chosen": -685.388671875, "logps/rejected": -786.59228515625, "loss": 0.4727, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.036313533782959, "rewards/margins": 1.435438871383667, "rewards/rejected": -5.471752643585205, "step": 13330 }, { "epoch": 0.87, "grad_norm": 31.75, "learning_rate": 2.4219055940067156e-07, "logits/chosen": -1.5473520755767822, "logits/rejected": -1.1591984033584595, "logps/chosen": -696.2213134765625, "logps/rejected": -799.7785034179688, "loss": 0.6862, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.19878625869751, "rewards/margins": 1.4168941974639893, "rewards/rejected": -5.615680694580078, "step": 13340 }, { "epoch": 0.87, "grad_norm": 54.25, "learning_rate": 2.3974455003275137e-07, "logits/chosen": -1.1065648794174194, "logits/rejected": -1.1033493280410767, "logps/chosen": -661.9840087890625, "logps/rejected": -979.50341796875, "loss": 0.3502, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.3227314949035645, "rewards/margins": 2.794921636581421, "rewards/rejected": -7.117652893066406, "step": 13350 }, { "epoch": 0.87, "grad_norm": 27.5, "learning_rate": 2.3731033297901663e-07, "logits/chosen": -1.4506008625030518, "logits/rejected": -1.3826196193695068, "logps/chosen": -680.720947265625, "logps/rejected": -796.3009643554688, "loss": 0.6796, "rewards/accuracies": 0.75, "rewards/chosen": -4.54086446762085, "rewards/margins": 1.0152584314346313, "rewards/rejected": -5.556122779846191, "step": 13360 }, { "epoch": 0.87, "grad_norm": 17.625, "learning_rate": 2.3488792093938117e-07, "logits/chosen": -1.3902013301849365, "logits/rejected": -0.7046431303024292, "logps/chosen": -711.5452880859375, "logps/rejected": -924.5979614257812, "loss": 0.3344, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.040633201599121, "rewards/margins": 2.7878708839416504, "rewards/rejected": -6.8285040855407715, "step": 13370 }, { "epoch": 0.88, "grad_norm": 6.25, "learning_rate": 2.324773265521707e-07, "logits/chosen": -1.4417166709899902, "logits/rejected": -0.8342768549919128, "logps/chosen": -723.4376220703125, "logps/rejected": -903.7635498046875, "loss": 0.4479, "rewards/accuracies": 0.75, "rewards/chosen": -4.238625526428223, "rewards/margins": 2.105958938598633, "rewards/rejected": -6.3445844650268555, "step": 13380 }, { "epoch": 0.88, "grad_norm": 30.5, "learning_rate": 2.30078562394056e-07, "logits/chosen": -1.7574183940887451, "logits/rejected": -1.2943403720855713, "logps/chosen": -627.895263671875, "logps/rejected": -834.0563354492188, "loss": 0.6094, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.116473197937012, "rewards/margins": 2.081397533416748, "rewards/rejected": -6.197870254516602, "step": 13390 }, { "epoch": 0.88, "grad_norm": 54.5, "learning_rate": 2.2769164097998397e-07, "logits/chosen": -1.4689090251922607, "logits/rejected": -0.9231687784194946, "logps/chosen": -680.343505859375, "logps/rejected": -744.614013671875, "loss": 0.6737, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.034160614013672, "rewards/margins": 1.561659812927246, "rewards/rejected": -5.595820426940918, "step": 13400 }, { "epoch": 0.88, "eval_logits/chosen": -1.5060392618179321, "eval_logits/rejected": -1.1420257091522217, "eval_logps/chosen": -669.100341796875, "eval_logps/rejected": -828.0372314453125, "eval_loss": 0.5151059627532959, "eval_rewards/accuracies": 0.7505000233650208, "eval_rewards/chosen": -4.044806957244873, "eval_rewards/margins": 1.7898489236831665, "eval_rewards/rejected": -5.83465576171875, "eval_runtime": 1082.628, "eval_samples_per_second": 1.847, "eval_steps_per_second": 1.847, "step": 13400 }, { "epoch": 0.88, "grad_norm": 32.75, "learning_rate": 2.2531657476311752e-07, "logits/chosen": -1.6937446594238281, "logits/rejected": -1.2724541425704956, "logps/chosen": -691.892822265625, "logps/rejected": -912.0023193359375, "loss": 0.5247, "rewards/accuracies": 0.75, "rewards/chosen": -4.256954669952393, "rewards/margins": 2.130265474319458, "rewards/rejected": -6.3872199058532715, "step": 13410 }, { "epoch": 0.88, "grad_norm": 7.6875, "learning_rate": 2.2295337613476714e-07, "logits/chosen": -1.2615830898284912, "logits/rejected": -0.4728579521179199, "logps/chosen": -726.5817260742188, "logps/rejected": -928.9802856445312, "loss": 0.3396, "rewards/accuracies": 0.875, "rewards/chosen": -4.107261657714844, "rewards/margins": 2.6031641960144043, "rewards/rejected": -6.710426330566406, "step": 13420 }, { "epoch": 0.88, "grad_norm": 232.0, "learning_rate": 2.2060205742432727e-07, "logits/chosen": -1.263390302658081, "logits/rejected": -0.8675936460494995, "logps/chosen": -664.7837524414062, "logps/rejected": -827.3263549804688, "loss": 0.4806, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.225018501281738, "rewards/margins": 2.007542133331299, "rewards/rejected": -6.2325615882873535, "step": 13430 }, { "epoch": 0.88, "grad_norm": 30.75, "learning_rate": 2.1826263089921114e-07, "logits/chosen": -1.0940314531326294, "logits/rejected": -0.2232934683561325, "logps/chosen": -599.1146850585938, "logps/rejected": -826.55517578125, "loss": 0.434, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.033860206604004, "rewards/margins": 2.1035284996032715, "rewards/rejected": -6.137389183044434, "step": 13440 }, { "epoch": 0.88, "grad_norm": 54.25, "learning_rate": 2.159351087647882e-07, "logits/chosen": -1.6125357151031494, "logits/rejected": -0.7487273812294006, "logps/chosen": -699.174560546875, "logps/rejected": -753.7546997070312, "loss": 0.6309, "rewards/accuracies": 0.75, "rewards/chosen": -3.969196319580078, "rewards/margins": 1.4298175573349, "rewards/rejected": -5.399013996124268, "step": 13450 }, { "epoch": 0.88, "grad_norm": 7.8125, "learning_rate": 2.1361950316432013e-07, "logits/chosen": -1.5927923917770386, "logits/rejected": -0.8266812562942505, "logps/chosen": -669.3335571289062, "logps/rejected": -757.3978881835938, "loss": 0.4428, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.876870632171631, "rewards/margins": 1.5809967517852783, "rewards/rejected": -5.45786714553833, "step": 13460 }, { "epoch": 0.88, "grad_norm": 42.75, "learning_rate": 2.113158261788964e-07, "logits/chosen": -1.41665518283844, "logits/rejected": -1.7775453329086304, "logps/chosen": -613.8785400390625, "logps/rejected": -788.5238647460938, "loss": 0.5438, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -3.9429943561553955, "rewards/margins": 1.582007884979248, "rewards/rejected": -5.525002479553223, "step": 13470 }, { "epoch": 0.88, "grad_norm": 5.0625, "learning_rate": 2.0902408982737182e-07, "logits/chosen": -1.5921553373336792, "logits/rejected": -1.50002920627594, "logps/chosen": -663.9389038085938, "logps/rejected": -783.9034423828125, "loss": 0.4961, "rewards/accuracies": 0.75, "rewards/chosen": -3.934406280517578, "rewards/margins": 1.5539488792419434, "rewards/rejected": -5.4883551597595215, "step": 13480 }, { "epoch": 0.88, "grad_norm": 10.9375, "learning_rate": 2.067443060663052e-07, "logits/chosen": -1.2717373371124268, "logits/rejected": -1.0100377798080444, "logps/chosen": -713.4901123046875, "logps/rejected": -842.705078125, "loss": 0.605, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.189897060394287, "rewards/margins": 1.575904369354248, "rewards/rejected": -5.765802383422852, "step": 13490 }, { "epoch": 0.88, "grad_norm": 11.5625, "learning_rate": 2.044764867898935e-07, "logits/chosen": -1.4911818504333496, "logits/rejected": -1.3408533334732056, "logps/chosen": -612.7347412109375, "logps/rejected": -843.314453125, "loss": 0.3819, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.8133444786071777, "rewards/margins": 2.2788941860198975, "rewards/rejected": -6.092238903045654, "step": 13500 }, { "epoch": 0.88, "eval_logits/chosen": -1.503798246383667, "eval_logits/rejected": -1.139869213104248, "eval_logps/chosen": -670.1139526367188, "eval_logps/rejected": -829.2462158203125, "eval_loss": 0.5151130557060242, "eval_rewards/accuracies": 0.7490000128746033, "eval_rewards/chosen": -4.054943084716797, "eval_rewards/margins": 1.7918022871017456, "eval_rewards/rejected": -5.846745014190674, "eval_runtime": 1082.6501, "eval_samples_per_second": 1.847, "eval_steps_per_second": 1.847, "step": 13500 }, { "epoch": 0.88, "grad_norm": 9.4375, "learning_rate": 2.02220643829914e-07, "logits/chosen": -1.2649977207183838, "logits/rejected": -0.9876346588134766, "logps/chosen": -574.664794921875, "logps/rejected": -813.8932495117188, "loss": 0.3635, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.5965163707733154, "rewards/margins": 2.4615540504455566, "rewards/rejected": -6.058070659637451, "step": 13510 }, { "epoch": 0.88, "grad_norm": 12.9375, "learning_rate": 1.9997678895566002e-07, "logits/chosen": -1.558394432067871, "logits/rejected": -0.7474314570426941, "logps/chosen": -654.3262939453125, "logps/rejected": -837.9288330078125, "loss": 0.4237, "rewards/accuracies": 0.75, "rewards/chosen": -4.253752708435059, "rewards/margins": 1.924399971961975, "rewards/rejected": -6.178152561187744, "step": 13520 }, { "epoch": 0.89, "grad_norm": 35.0, "learning_rate": 1.9774493387387904e-07, "logits/chosen": -1.2161718606948853, "logits/rejected": -0.5709174871444702, "logps/chosen": -748.6055297851562, "logps/rejected": -914.0421752929688, "loss": 0.3837, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.335314750671387, "rewards/margins": 2.1836130619049072, "rewards/rejected": -6.518927097320557, "step": 13530 }, { "epoch": 0.89, "grad_norm": 3.3125, "learning_rate": 1.9552509022871426e-07, "logits/chosen": -1.747867226600647, "logits/rejected": -0.7846584916114807, "logps/chosen": -622.5896606445312, "logps/rejected": -856.0773315429688, "loss": 0.3237, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.4918646812438965, "rewards/margins": 2.902956247329712, "rewards/rejected": -6.394820690155029, "step": 13540 }, { "epoch": 0.89, "grad_norm": 16.0, "learning_rate": 1.9331726960164137e-07, "logits/chosen": -1.6589272022247314, "logits/rejected": -1.4952062368392944, "logps/chosen": -685.5469970703125, "logps/rejected": -824.7197265625, "loss": 0.4131, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.638629913330078, "rewards/margins": 2.064333200454712, "rewards/rejected": -5.702963352203369, "step": 13550 }, { "epoch": 0.89, "grad_norm": 69.0, "learning_rate": 1.9112148351140836e-07, "logits/chosen": -1.3866403102874756, "logits/rejected": -1.2627406120300293, "logps/chosen": -626.8496704101562, "logps/rejected": -840.4171142578125, "loss": 0.5636, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.203787326812744, "rewards/margins": 1.9808582067489624, "rewards/rejected": -6.184645652770996, "step": 13560 }, { "epoch": 0.89, "grad_norm": 40.25, "learning_rate": 1.889377434139769e-07, "logits/chosen": -1.7470375299453735, "logits/rejected": -0.707180917263031, "logps/chosen": -706.8977661132812, "logps/rejected": -781.4537963867188, "loss": 0.3671, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.1290974617004395, "rewards/margins": 1.8900716304779053, "rewards/rejected": -6.019169330596924, "step": 13570 }, { "epoch": 0.89, "grad_norm": 9.9375, "learning_rate": 1.8676606070246266e-07, "logits/chosen": -0.8908470273017883, "logits/rejected": -0.46409597992897034, "logps/chosen": -670.129638671875, "logps/rejected": -895.7252197265625, "loss": 0.4778, "rewards/accuracies": 0.75, "rewards/chosen": -4.010275840759277, "rewards/margins": 2.408250093460083, "rewards/rejected": -6.418524742126465, "step": 13580 }, { "epoch": 0.89, "grad_norm": 22.875, "learning_rate": 1.8460644670707267e-07, "logits/chosen": -1.5870654582977295, "logits/rejected": -1.4287662506103516, "logps/chosen": -637.0712280273438, "logps/rejected": -821.4818115234375, "loss": 0.4906, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.8454291820526123, "rewards/margins": 1.8523069620132446, "rewards/rejected": -5.6977362632751465, "step": 13590 }, { "epoch": 0.89, "grad_norm": 33.25, "learning_rate": 1.8245891269505017e-07, "logits/chosen": -1.5936305522918701, "logits/rejected": -1.3783609867095947, "logps/chosen": -600.5499877929688, "logps/rejected": -758.1959228515625, "loss": 0.8034, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -3.8712029457092285, "rewards/margins": 1.3054590225219727, "rewards/rejected": -5.176661968231201, "step": 13600 }, { "epoch": 0.89, "eval_logits/chosen": -1.5017755031585693, "eval_logits/rejected": -1.136734127998352, "eval_logps/chosen": -670.9915161132812, "eval_logps/rejected": -830.4300537109375, "eval_loss": 0.5154497027397156, "eval_rewards/accuracies": 0.7490000128746033, "eval_rewards/chosen": -4.063719272613525, "eval_rewards/margins": 1.7948648929595947, "eval_rewards/rejected": -5.858583927154541, "eval_runtime": 1082.7845, "eval_samples_per_second": 1.847, "eval_steps_per_second": 1.847, "step": 13600 }, { "epoch": 0.89, "grad_norm": 11.0625, "learning_rate": 1.8032346987061384e-07, "logits/chosen": -1.4501222372055054, "logits/rejected": -1.193864107131958, "logps/chosen": -616.0745849609375, "logps/rejected": -798.6109619140625, "loss": 0.5001, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.891310930252075, "rewards/margins": 1.7338253259658813, "rewards/rejected": -5.625136375427246, "step": 13610 }, { "epoch": 0.89, "grad_norm": 58.25, "learning_rate": 1.7820012937489833e-07, "logits/chosen": -1.3723113536834717, "logits/rejected": -0.8261283040046692, "logps/chosen": -683.1546630859375, "logps/rejected": -930.6246337890625, "loss": 0.3906, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.237913131713867, "rewards/margins": 2.705251693725586, "rewards/rejected": -6.943164825439453, "step": 13620 }, { "epoch": 0.89, "grad_norm": 26.375, "learning_rate": 1.760889022858997e-07, "logits/chosen": -1.5946484804153442, "logits/rejected": -1.4808439016342163, "logps/chosen": -579.5208129882812, "logps/rejected": -802.6704711914062, "loss": 0.4162, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.433154582977295, "rewards/margins": 2.3784022331237793, "rewards/rejected": -5.811556816101074, "step": 13630 }, { "epoch": 0.89, "grad_norm": 9.6875, "learning_rate": 1.7398979961841367e-07, "logits/chosen": -1.1434919834136963, "logits/rejected": -1.0144751071929932, "logps/chosen": -728.9832153320312, "logps/rejected": -856.5364990234375, "loss": 0.4893, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.668473720550537, "rewards/margins": 1.4198487997055054, "rewards/rejected": -6.088322162628174, "step": 13640 }, { "epoch": 0.89, "grad_norm": 36.5, "learning_rate": 1.719028323239802e-07, "logits/chosen": -1.3033682107925415, "logits/rejected": -0.7145559191703796, "logps/chosen": -585.498046875, "logps/rejected": -830.09619140625, "loss": 0.2935, "rewards/accuracies": 0.875, "rewards/chosen": -3.7618918418884277, "rewards/margins": 2.399820327758789, "rewards/rejected": -6.161712169647217, "step": 13650 }, { "epoch": 0.89, "grad_norm": 18.625, "learning_rate": 1.69828011290826e-07, "logits/chosen": -1.6054388284683228, "logits/rejected": -1.0840543508529663, "logps/chosen": -657.7322387695312, "logps/rejected": -837.5631713867188, "loss": 0.36, "rewards/accuracies": 0.75, "rewards/chosen": -3.597449779510498, "rewards/margins": 2.2874674797058105, "rewards/rejected": -5.884917259216309, "step": 13660 }, { "epoch": 0.89, "grad_norm": 16.25, "learning_rate": 1.6776534734380817e-07, "logits/chosen": -1.302690863609314, "logits/rejected": -1.1003791093826294, "logps/chosen": -627.6024780273438, "logps/rejected": -921.8064575195312, "loss": 0.401, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.8639464378356934, "rewards/margins": 2.6079280376434326, "rewards/rejected": -6.471875190734863, "step": 13670 }, { "epoch": 0.9, "grad_norm": 11.75, "learning_rate": 1.6571485124435682e-07, "logits/chosen": -1.034483551979065, "logits/rejected": -1.2488367557525635, "logps/chosen": -657.0856323242188, "logps/rejected": -886.8914184570312, "loss": 0.3234, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.180108070373535, "rewards/margins": 2.501439332962036, "rewards/rejected": -6.68154764175415, "step": 13680 }, { "epoch": 0.9, "grad_norm": 6.4375, "learning_rate": 1.6367653369041946e-07, "logits/chosen": -1.3765370845794678, "logits/rejected": -1.2604020833969116, "logps/chosen": -709.4337158203125, "logps/rejected": -924.5352783203125, "loss": 0.3614, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.161559104919434, "rewards/margins": 2.3202595710754395, "rewards/rejected": -6.481818199157715, "step": 13690 }, { "epoch": 0.9, "grad_norm": 17.875, "learning_rate": 1.6165040531640557e-07, "logits/chosen": -1.379528522491455, "logits/rejected": -0.9827286601066589, "logps/chosen": -699.6856689453125, "logps/rejected": -906.9583740234375, "loss": 0.4371, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.22186803817749, "rewards/margins": 2.2486655712127686, "rewards/rejected": -6.470534324645996, "step": 13700 }, { "epoch": 0.9, "eval_logits/chosen": -1.4990501403808594, "eval_logits/rejected": -1.1337603330612183, "eval_logps/chosen": -672.57666015625, "eval_logps/rejected": -832.36083984375, "eval_loss": 0.5156986117362976, "eval_rewards/accuracies": 0.7494999766349792, "eval_rewards/chosen": -4.07957124710083, "eval_rewards/margins": 1.7983202934265137, "eval_rewards/rejected": -5.8778910636901855, "eval_runtime": 1082.6551, "eval_samples_per_second": 1.847, "eval_steps_per_second": 1.847, "step": 13700 }, { "epoch": 0.9, "grad_norm": 27.25, "learning_rate": 1.5963647669312938e-07, "logits/chosen": -1.732709527015686, "logits/rejected": -0.9419218301773071, "logps/chosen": -704.6703491210938, "logps/rejected": -799.585205078125, "loss": 0.4935, "rewards/accuracies": 0.75, "rewards/chosen": -4.0426554679870605, "rewards/margins": 1.718501091003418, "rewards/rejected": -5.761157035827637, "step": 13710 }, { "epoch": 0.9, "grad_norm": 9.8125, "learning_rate": 1.5763475832775738e-07, "logits/chosen": -1.5491251945495605, "logits/rejected": -1.1623218059539795, "logps/chosen": -555.3155517578125, "logps/rejected": -793.0640869140625, "loss": 0.2606, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.1718637943267822, "rewards/margins": 2.625633716583252, "rewards/rejected": -5.797497749328613, "step": 13720 }, { "epoch": 0.9, "grad_norm": 6.28125, "learning_rate": 1.5564526066375151e-07, "logits/chosen": -1.3904610872268677, "logits/rejected": -1.2077701091766357, "logps/chosen": -639.2657470703125, "logps/rejected": -781.9466552734375, "loss": 0.6124, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.028054714202881, "rewards/margins": 1.3291046619415283, "rewards/rejected": -5.35715913772583, "step": 13730 }, { "epoch": 0.9, "grad_norm": 14.25, "learning_rate": 1.5366799408081557e-07, "logits/chosen": -1.7086114883422852, "logits/rejected": -1.0808404684066772, "logps/chosen": -688.7163696289062, "logps/rejected": -734.1798095703125, "loss": 0.5955, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.7744204998016357, "rewards/margins": 1.3673133850097656, "rewards/rejected": -5.1417341232299805, "step": 13740 }, { "epoch": 0.9, "grad_norm": 18.5, "learning_rate": 1.5170296889483987e-07, "logits/chosen": -1.3724133968353271, "logits/rejected": -1.2909491062164307, "logps/chosen": -691.4888305664062, "logps/rejected": -899.8841552734375, "loss": 0.4028, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.3253254890441895, "rewards/margins": 1.9758689403533936, "rewards/rejected": -6.301194667816162, "step": 13750 }, { "epoch": 0.9, "grad_norm": 26.0, "learning_rate": 1.4975019535784923e-07, "logits/chosen": -1.5853593349456787, "logits/rejected": -1.3581726551055908, "logps/chosen": -652.9265747070312, "logps/rejected": -802.5459594726562, "loss": 0.5154, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.011972427368164, "rewards/margins": 1.5304895639419556, "rewards/rejected": -5.54246187210083, "step": 13760 }, { "epoch": 0.9, "grad_norm": 13.3125, "learning_rate": 1.4780968365794874e-07, "logits/chosen": -1.3218164443969727, "logits/rejected": -1.148216962814331, "logps/chosen": -736.9437866210938, "logps/rejected": -876.6680908203125, "loss": 0.5922, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.550312042236328, "rewards/margins": 1.2696870565414429, "rewards/rejected": -5.8199992179870605, "step": 13770 }, { "epoch": 0.9, "grad_norm": 8.8125, "learning_rate": 1.4588144391926935e-07, "logits/chosen": -1.437699556350708, "logits/rejected": -0.789788007736206, "logps/chosen": -688.605712890625, "logps/rejected": -850.5232543945312, "loss": 0.5018, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.181762218475342, "rewards/margins": 2.1112279891967773, "rewards/rejected": -6.292990684509277, "step": 13780 }, { "epoch": 0.9, "grad_norm": 19.5, "learning_rate": 1.4396548620191714e-07, "logits/chosen": -1.4717270135879517, "logits/rejected": -0.758381724357605, "logps/chosen": -662.2940673828125, "logps/rejected": -819.8983154296875, "loss": 0.5296, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.205971717834473, "rewards/margins": 1.776846170425415, "rewards/rejected": -5.982817649841309, "step": 13790 }, { "epoch": 0.9, "grad_norm": 36.5, "learning_rate": 1.4206182050191946e-07, "logits/chosen": -1.6988931894302368, "logits/rejected": -1.4125220775604248, "logps/chosen": -628.3203735351562, "logps/rejected": -810.7428588867188, "loss": 0.3428, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.5636985301971436, "rewards/margins": 2.1370623111724854, "rewards/rejected": -5.700760841369629, "step": 13800 }, { "epoch": 0.9, "eval_logits/chosen": -1.5000897645950317, "eval_logits/rejected": -1.1346989870071411, "eval_logps/chosen": -672.1581420898438, "eval_logps/rejected": -831.8970336914062, "eval_loss": 0.5154798030853271, "eval_rewards/accuracies": 0.7494999766349792, "eval_rewards/chosen": -4.075385570526123, "eval_rewards/margins": 1.7978686094284058, "eval_rewards/rejected": -5.873253345489502, "eval_runtime": 1082.6586, "eval_samples_per_second": 1.847, "eval_steps_per_second": 1.847, "step": 13800 }, { "epoch": 0.9, "grad_norm": 43.5, "learning_rate": 1.4017045675117301e-07, "logits/chosen": -1.6317275762557983, "logits/rejected": -1.068319320678711, "logps/chosen": -690.3223266601562, "logps/rejected": -892.5054931640625, "loss": 0.4482, "rewards/accuracies": 0.875, "rewards/chosen": -4.000103950500488, "rewards/margins": 2.410576820373535, "rewards/rejected": -6.410679817199707, "step": 13810 }, { "epoch": 0.9, "grad_norm": 28.125, "learning_rate": 1.382914048173928e-07, "logits/chosen": -1.7365009784698486, "logits/rejected": -1.3811695575714111, "logps/chosen": -575.8255615234375, "logps/rejected": -744.5897216796875, "loss": 0.5562, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.8202781677246094, "rewards/margins": 1.6877740621566772, "rewards/rejected": -5.508051872253418, "step": 13820 }, { "epoch": 0.9, "grad_norm": 57.75, "learning_rate": 1.3642467450405955e-07, "logits/chosen": -1.421283483505249, "logits/rejected": -1.4586012363433838, "logps/chosen": -679.3118896484375, "logps/rejected": -817.9019165039062, "loss": 0.5691, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.9231464862823486, "rewards/margins": 1.6027956008911133, "rewards/rejected": -5.525942802429199, "step": 13830 }, { "epoch": 0.91, "grad_norm": 25.75, "learning_rate": 1.3457027555036832e-07, "logits/chosen": -1.7454506158828735, "logits/rejected": -0.7976868152618408, "logps/chosen": -611.3190307617188, "logps/rejected": -900.8497924804688, "loss": 0.2923, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.799614429473877, "rewards/margins": 2.86373233795166, "rewards/rejected": -6.663346767425537, "step": 13840 }, { "epoch": 0.91, "grad_norm": 12.1875, "learning_rate": 1.3272821763117948e-07, "logits/chosen": -1.2431375980377197, "logits/rejected": -1.091731309890747, "logps/chosen": -604.5424194335938, "logps/rejected": -768.3635864257812, "loss": 0.4867, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.0526323318481445, "rewards/margins": 1.5794814825057983, "rewards/rejected": -5.632113456726074, "step": 13850 }, { "epoch": 0.91, "grad_norm": 27.875, "learning_rate": 1.3089851035696738e-07, "logits/chosen": -1.265124797821045, "logits/rejected": -1.3109161853790283, "logps/chosen": -664.4851684570312, "logps/rejected": -857.4478759765625, "loss": 0.6826, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.4117841720581055, "rewards/margins": 1.7559242248535156, "rewards/rejected": -6.167708396911621, "step": 13860 }, { "epoch": 0.91, "grad_norm": 55.5, "learning_rate": 1.2908116327376875e-07, "logits/chosen": -1.4682629108428955, "logits/rejected": -1.1264333724975586, "logps/chosen": -644.4199829101562, "logps/rejected": -730.1370849609375, "loss": 0.678, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.9818947315216064, "rewards/margins": 1.0441501140594482, "rewards/rejected": -5.026045799255371, "step": 13870 }, { "epoch": 0.91, "grad_norm": 18.75, "learning_rate": 1.2727618586313495e-07, "logits/chosen": -2.0166523456573486, "logits/rejected": -1.0816282033920288, "logps/chosen": -742.3630981445312, "logps/rejected": -846.6320190429688, "loss": 0.4771, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.2850260734558105, "rewards/margins": 1.653031349182129, "rewards/rejected": -5.938057899475098, "step": 13880 }, { "epoch": 0.91, "grad_norm": 35.0, "learning_rate": 1.254835875420818e-07, "logits/chosen": -1.5131808519363403, "logits/rejected": -0.9916925430297852, "logps/chosen": -587.3331298828125, "logps/rejected": -696.5231323242188, "loss": 0.6767, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.803973436355591, "rewards/margins": 1.4406182765960693, "rewards/rejected": -5.24459171295166, "step": 13890 }, { "epoch": 0.91, "grad_norm": 47.25, "learning_rate": 1.237033776630392e-07, "logits/chosen": -1.344658613204956, "logits/rejected": -1.2246429920196533, "logps/chosen": -698.9102783203125, "logps/rejected": -868.4714965820312, "loss": 0.5029, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.398816108703613, "rewards/margins": 1.9423103332519531, "rewards/rejected": -6.341126441955566, "step": 13900 }, { "epoch": 0.91, "eval_logits/chosen": -1.5004322528839111, "eval_logits/rejected": -1.1350605487823486, "eval_logps/chosen": -671.9635009765625, "eval_logps/rejected": -831.66162109375, "eval_loss": 0.5156022906303406, "eval_rewards/accuracies": 0.7494999766349792, "eval_rewards/chosen": -4.073439598083496, "eval_rewards/margins": 1.7974597215652466, "eval_rewards/rejected": -5.870899200439453, "eval_runtime": 1082.6573, "eval_samples_per_second": 1.847, "eval_steps_per_second": 1.847, "step": 13900 }, { "epoch": 0.91, "grad_norm": 22.5, "learning_rate": 1.219355655138052e-07, "logits/chosen": -1.6186354160308838, "logits/rejected": -1.121442198753357, "logps/chosen": -647.9474487304688, "logps/rejected": -815.1851196289062, "loss": 0.4113, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.7660961151123047, "rewards/margins": 1.8905839920043945, "rewards/rejected": -5.656679630279541, "step": 13910 }, { "epoch": 0.91, "grad_norm": 18.5, "learning_rate": 1.2018016031749514e-07, "logits/chosen": -1.3509947061538696, "logits/rejected": -1.3100658655166626, "logps/chosen": -596.5263061523438, "logps/rejected": -793.6414184570312, "loss": 0.4399, "rewards/accuracies": 0.875, "rewards/chosen": -3.7531466484069824, "rewards/margins": 1.7230043411254883, "rewards/rejected": -5.476150989532471, "step": 13920 }, { "epoch": 0.91, "grad_norm": 34.75, "learning_rate": 1.1843717123249331e-07, "logits/chosen": -1.5992892980575562, "logits/rejected": -1.1540720462799072, "logps/chosen": -733.0587158203125, "logps/rejected": -846.4431762695312, "loss": 0.4506, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.0365729331970215, "rewards/margins": 1.8907638788223267, "rewards/rejected": -5.9273362159729, "step": 13930 }, { "epoch": 0.91, "grad_norm": 33.25, "learning_rate": 1.1670660735240702e-07, "logits/chosen": -1.6351385116577148, "logits/rejected": -1.0813556909561157, "logps/chosen": -672.4617919921875, "logps/rejected": -799.3289184570312, "loss": 0.5935, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.023453235626221, "rewards/margins": 1.3555313348770142, "rewards/rejected": -5.378984451293945, "step": 13940 }, { "epoch": 0.91, "grad_norm": 15.25, "learning_rate": 1.1498847770601839e-07, "logits/chosen": -1.2962348461151123, "logits/rejected": -1.5063269138336182, "logps/chosen": -634.4072265625, "logps/rejected": -899.046875, "loss": 0.3774, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.642983913421631, "rewards/margins": 2.316915512084961, "rewards/rejected": -5.959898948669434, "step": 13950 }, { "epoch": 0.91, "grad_norm": 5.3125, "learning_rate": 1.1328279125723568e-07, "logits/chosen": -1.5751912593841553, "logits/rejected": -1.190748929977417, "logps/chosen": -667.0145263671875, "logps/rejected": -848.5634765625, "loss": 0.4118, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.6986212730407715, "rewards/margins": 1.8952213525772095, "rewards/rejected": -5.593842506408691, "step": 13960 }, { "epoch": 0.91, "grad_norm": 2.328125, "learning_rate": 1.1158955690504958e-07, "logits/chosen": -1.5940449237823486, "logits/rejected": -1.6255643367767334, "logps/chosen": -648.8529663085938, "logps/rejected": -842.8118896484375, "loss": 0.5916, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.9715182781219482, "rewards/margins": 1.7030658721923828, "rewards/rejected": -5.67458438873291, "step": 13970 }, { "epoch": 0.91, "grad_norm": 13.25, "learning_rate": 1.0990878348348416e-07, "logits/chosen": -1.4499988555908203, "logits/rejected": -1.1313774585723877, "logps/chosen": -620.6799926757812, "logps/rejected": -846.0452880859375, "loss": 0.5655, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.9392647743225098, "rewards/margins": 2.385338068008423, "rewards/rejected": -6.3246026039123535, "step": 13980 }, { "epoch": 0.92, "grad_norm": 54.75, "learning_rate": 1.0824047976155189e-07, "logits/chosen": -1.353087306022644, "logits/rejected": -1.4823195934295654, "logps/chosen": -665.24365234375, "logps/rejected": -772.1153564453125, "loss": 0.6029, "rewards/accuracies": 0.75, "rewards/chosen": -4.0325927734375, "rewards/margins": 1.184091329574585, "rewards/rejected": -5.216683864593506, "step": 13990 }, { "epoch": 0.92, "grad_norm": 14.9375, "learning_rate": 1.0658465444320754e-07, "logits/chosen": -1.97750985622406, "logits/rejected": -1.2648087739944458, "logps/chosen": -650.1311645507812, "logps/rejected": -779.7244873046875, "loss": 0.5905, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.928241729736328, "rewards/margins": 1.7251157760620117, "rewards/rejected": -5.65335750579834, "step": 14000 }, { "epoch": 0.92, "eval_logits/chosen": -1.4997376203536987, "eval_logits/rejected": -1.1344692707061768, "eval_logps/chosen": -672.219970703125, "eval_logps/rejected": -831.9776611328125, "eval_loss": 0.5154688358306885, "eval_rewards/accuracies": 0.7524999976158142, "eval_rewards/chosen": -4.076004505157471, "eval_rewards/margins": 1.7980562448501587, "eval_rewards/rejected": -5.87406063079834, "eval_runtime": 1082.6416, "eval_samples_per_second": 1.847, "eval_steps_per_second": 1.847, "step": 14000 }, { "epoch": 0.92, "grad_norm": 10.9375, "learning_rate": 1.0494131616730324e-07, "logits/chosen": -1.5466485023498535, "logits/rejected": -1.141755223274231, "logps/chosen": -604.1151733398438, "logps/rejected": -739.9758911132812, "loss": 0.5925, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.083165168762207, "rewards/margins": 1.2194801568984985, "rewards/rejected": -5.302645683288574, "step": 14010 }, { "epoch": 0.92, "grad_norm": 20.5, "learning_rate": 1.0331047350754325e-07, "logits/chosen": -1.0905673503875732, "logits/rejected": -1.3011325597763062, "logps/chosen": -714.4012451171875, "logps/rejected": -884.0778198242188, "loss": 0.7343, "rewards/accuracies": 0.75, "rewards/chosen": -4.392796993255615, "rewards/margins": 1.8580167293548584, "rewards/rejected": -6.250814914703369, "step": 14020 }, { "epoch": 0.92, "grad_norm": 18.375, "learning_rate": 1.016921349724384e-07, "logits/chosen": -1.7991666793823242, "logits/rejected": -0.9868572950363159, "logps/chosen": -699.2430419921875, "logps/rejected": -858.8880615234375, "loss": 0.4952, "rewards/accuracies": 0.75, "rewards/chosen": -4.1293559074401855, "rewards/margins": 2.2269442081451416, "rewards/rejected": -6.356299877166748, "step": 14030 }, { "epoch": 0.92, "grad_norm": 11.9375, "learning_rate": 1.0008630900526367e-07, "logits/chosen": -0.8669900894165039, "logits/rejected": -0.7935361266136169, "logps/chosen": -671.0322265625, "logps/rejected": -854.3895263671875, "loss": 0.5717, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.251683235168457, "rewards/margins": 1.8141753673553467, "rewards/rejected": -6.065858364105225, "step": 14040 }, { "epoch": 0.92, "grad_norm": 22.375, "learning_rate": 9.849300398401124e-08, "logits/chosen": -1.6921088695526123, "logits/rejected": -1.6061817407608032, "logps/chosen": -598.7235107421875, "logps/rejected": -738.9817504882812, "loss": 0.5063, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -3.863051652908325, "rewards/margins": 1.433915138244629, "rewards/rejected": -5.296967506408691, "step": 14050 }, { "epoch": 0.92, "grad_norm": 7.15625, "learning_rate": 9.691222822134971e-08, "logits/chosen": -1.36632239818573, "logits/rejected": -1.1784974336624146, "logps/chosen": -688.5608520507812, "logps/rejected": -876.4844970703125, "loss": 0.4455, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.4029741287231445, "rewards/margins": 1.897402048110962, "rewards/rejected": -6.300376892089844, "step": 14060 }, { "epoch": 0.92, "grad_norm": 38.75, "learning_rate": 9.53439899645786e-08, "logits/chosen": -1.7060823440551758, "logits/rejected": -1.124063491821289, "logps/chosen": -643.2411499023438, "logps/rejected": -812.2536010742188, "loss": 0.2914, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.8661179542541504, "rewards/margins": 2.1024112701416016, "rewards/rejected": -5.96852970123291, "step": 14070 }, { "epoch": 0.92, "grad_norm": 49.75, "learning_rate": 9.378829739558698e-08, "logits/chosen": -1.4881768226623535, "logits/rejected": -1.42363703250885, "logps/chosen": -706.9111938476562, "logps/rejected": -844.0046997070312, "loss": 0.5618, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.326229572296143, "rewards/margins": 1.7194360494613647, "rewards/rejected": -6.045665264129639, "step": 14080 }, { "epoch": 0.92, "grad_norm": 55.0, "learning_rate": 9.224515863080874e-08, "logits/chosen": -1.221100091934204, "logits/rejected": -1.284743070602417, "logps/chosen": -607.6595458984375, "logps/rejected": -809.782958984375, "loss": 0.6162, "rewards/accuracies": 0.625, "rewards/chosen": -3.9626107215881348, "rewards/margins": 1.5801469087600708, "rewards/rejected": -5.542757034301758, "step": 14090 }, { "epoch": 0.92, "grad_norm": 1.9140625, "learning_rate": 9.071458172118269e-08, "logits/chosen": -1.7279497385025024, "logits/rejected": -1.0224688053131104, "logps/chosen": -705.5316772460938, "logps/rejected": -833.7159423828125, "loss": 0.3965, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.9568352699279785, "rewards/margins": 2.136061906814575, "rewards/rejected": -6.092897415161133, "step": 14100 }, { "epoch": 0.92, "eval_logits/chosen": -1.499449372291565, "eval_logits/rejected": -1.1342741250991821, "eval_logps/chosen": -672.4373168945312, "eval_logps/rejected": -832.1840209960938, "eval_loss": 0.5156548023223877, "eval_rewards/accuracies": 0.7505000233650208, "eval_rewards/chosen": -4.078176975250244, "eval_rewards/margins": 1.7979466915130615, "eval_rewards/rejected": -5.876122951507568, "eval_runtime": 1082.6942, "eval_samples_per_second": 1.847, "eval_steps_per_second": 1.847, "step": 14100 }, { "epoch": 0.92, "grad_norm": 26.875, "learning_rate": 8.919657465210867e-08, "logits/chosen": -1.4488439559936523, "logits/rejected": -1.4028602838516235, "logps/chosen": -645.6329956054688, "logps/rejected": -851.7180786132812, "loss": 0.3569, "rewards/accuracies": 0.875, "rewards/chosen": -3.561392307281494, "rewards/margins": 1.8661315441131592, "rewards/rejected": -5.427524089813232, "step": 14110 }, { "epoch": 0.92, "grad_norm": 4.59375, "learning_rate": 8.769114534340623e-08, "logits/chosen": -1.7308769226074219, "logits/rejected": -0.7748288512229919, "logps/chosen": -602.2237548828125, "logps/rejected": -841.59130859375, "loss": 0.3053, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.470341920852661, "rewards/margins": 2.662076234817505, "rewards/rejected": -6.132417678833008, "step": 14120 }, { "epoch": 0.92, "grad_norm": 11.0625, "learning_rate": 8.619830164927484e-08, "logits/chosen": -1.4480222463607788, "logits/rejected": -1.0388654470443726, "logps/chosen": -749.1549072265625, "logps/rejected": -864.4166259765625, "loss": 0.51, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.525399684906006, "rewards/margins": 1.7322311401367188, "rewards/rejected": -6.257630825042725, "step": 14130 }, { "epoch": 0.93, "grad_norm": 49.75, "learning_rate": 8.4718051358251e-08, "logits/chosen": -1.4096217155456543, "logits/rejected": -1.2999186515808105, "logps/chosen": -669.1903076171875, "logps/rejected": -759.8522338867188, "loss": 0.5537, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.23410177230835, "rewards/margins": 1.3511404991149902, "rewards/rejected": -5.585242748260498, "step": 14140 }, { "epoch": 0.93, "grad_norm": 2.9375, "learning_rate": 8.32504021931671e-08, "logits/chosen": -1.4515453577041626, "logits/rejected": -1.0320031642913818, "logps/chosen": -643.5025634765625, "logps/rejected": -901.56396484375, "loss": 0.661, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.238926887512207, "rewards/margins": 2.1111388206481934, "rewards/rejected": -6.350064754486084, "step": 14150 }, { "epoch": 0.93, "grad_norm": 10.8125, "learning_rate": 8.179536181111447e-08, "logits/chosen": -1.495094656944275, "logits/rejected": -0.882555365562439, "logps/chosen": -720.068359375, "logps/rejected": -847.4959716796875, "loss": 0.5876, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.4669342041015625, "rewards/margins": 1.7935165166854858, "rewards/rejected": -6.26045036315918, "step": 14160 }, { "epoch": 0.93, "grad_norm": 16.375, "learning_rate": 8.035293780339987e-08, "logits/chosen": -1.2596606016159058, "logits/rejected": -1.4442110061645508, "logps/chosen": -619.6034545898438, "logps/rejected": -774.128173828125, "loss": 0.5518, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.0498046875, "rewards/margins": 1.265730619430542, "rewards/rejected": -5.3155341148376465, "step": 14170 }, { "epoch": 0.93, "grad_norm": 40.5, "learning_rate": 7.892313769550769e-08, "logits/chosen": -1.2903321981430054, "logits/rejected": -0.9605830907821655, "logps/chosen": -687.693359375, "logps/rejected": -979.4856567382812, "loss": 0.6908, "rewards/accuracies": 0.75, "rewards/chosen": -4.3287034034729, "rewards/margins": 2.8323934078216553, "rewards/rejected": -7.161097049713135, "step": 14180 }, { "epoch": 0.93, "grad_norm": 67.5, "learning_rate": 7.750596894706003e-08, "logits/chosen": -1.5554156303405762, "logits/rejected": -1.008979082107544, "logps/chosen": -767.0869140625, "logps/rejected": -918.0064697265625, "loss": 0.4154, "rewards/accuracies": 0.75, "rewards/chosen": -4.382875919342041, "rewards/margins": 1.9149179458618164, "rewards/rejected": -6.297793865203857, "step": 14190 }, { "epoch": 0.93, "grad_norm": 10.5625, "learning_rate": 7.610143895177891e-08, "logits/chosen": -1.2784878015518188, "logits/rejected": -0.9710628390312195, "logps/chosen": -656.9549560546875, "logps/rejected": -800.5667114257812, "loss": 0.4038, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.8789775371551514, "rewards/margins": 2.036717653274536, "rewards/rejected": -5.915695667266846, "step": 14200 }, { "epoch": 0.93, "eval_logits/chosen": -1.4993796348571777, "eval_logits/rejected": -1.1339811086654663, "eval_logps/chosen": -672.5670166015625, "eval_logps/rejected": -832.3638916015625, "eval_loss": 0.5156022310256958, "eval_rewards/accuracies": 0.7490000128746033, "eval_rewards/chosen": -4.079473972320557, "eval_rewards/margins": 1.7984479665756226, "eval_rewards/rejected": -5.877922534942627, "eval_runtime": 1082.6512, "eval_samples_per_second": 1.847, "eval_steps_per_second": 1.847, "step": 14200 }, { "epoch": 0.93, "grad_norm": 75.5, "learning_rate": 7.470955503744576e-08, "logits/chosen": -1.0828297138214111, "logits/rejected": -0.846405029296875, "logps/chosen": -637.0018920898438, "logps/rejected": -761.9502563476562, "loss": 0.8716, "rewards/accuracies": 0.625, "rewards/chosen": -4.158189296722412, "rewards/margins": 1.210518717765808, "rewards/rejected": -5.368708610534668, "step": 14210 }, { "epoch": 0.93, "grad_norm": 24.625, "learning_rate": 7.33303244658648e-08, "logits/chosen": -1.5756285190582275, "logits/rejected": -1.484117865562439, "logps/chosen": -630.7351684570312, "logps/rejected": -782.8128662109375, "loss": 0.523, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.003634929656982, "rewards/margins": 1.5873786211013794, "rewards/rejected": -5.5910139083862305, "step": 14220 }, { "epoch": 0.93, "grad_norm": 41.5, "learning_rate": 7.196375443282444e-08, "logits/chosen": -1.1041700839996338, "logits/rejected": -1.296731948852539, "logps/chosen": -632.415283203125, "logps/rejected": -805.6100463867188, "loss": 0.631, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.1630096435546875, "rewards/margins": 1.1999112367630005, "rewards/rejected": -5.362921237945557, "step": 14230 }, { "epoch": 0.93, "grad_norm": 18.625, "learning_rate": 7.060985206806037e-08, "logits/chosen": -1.641922950744629, "logits/rejected": -1.49126136302948, "logps/chosen": -716.67919921875, "logps/rejected": -870.072265625, "loss": 0.4555, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.282904624938965, "rewards/margins": 1.8123044967651367, "rewards/rejected": -6.09520959854126, "step": 14240 }, { "epoch": 0.93, "grad_norm": 18.375, "learning_rate": 6.926862443521754e-08, "logits/chosen": -1.1060563325881958, "logits/rejected": -1.3081772327423096, "logps/chosen": -623.88525390625, "logps/rejected": -828.1114501953125, "loss": 0.3824, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.7117676734924316, "rewards/margins": 2.2009780406951904, "rewards/rejected": -5.912745475769043, "step": 14250 }, { "epoch": 0.93, "grad_norm": 17.25, "learning_rate": 6.794007853181378e-08, "logits/chosen": -1.4524614810943604, "logits/rejected": -1.2324833869934082, "logps/chosen": -654.4378051757812, "logps/rejected": -799.3768310546875, "loss": 0.5021, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.6663215160369873, "rewards/margins": 1.7809514999389648, "rewards/rejected": -5.447274208068848, "step": 14260 }, { "epoch": 0.93, "grad_norm": 25.375, "learning_rate": 6.662422128920265e-08, "logits/chosen": -1.2171748876571655, "logits/rejected": -0.8690723180770874, "logps/chosen": -670.32080078125, "logps/rejected": -807.5103759765625, "loss": 0.5082, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.998788356781006, "rewards/margins": 1.774357795715332, "rewards/rejected": -5.773146152496338, "step": 14270 }, { "epoch": 0.93, "grad_norm": 4.4375, "learning_rate": 6.532105957253843e-08, "logits/chosen": -1.2785096168518066, "logits/rejected": -0.9676925539970398, "logps/chosen": -608.2987060546875, "logps/rejected": -829.6800537109375, "loss": 0.3142, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.8917384147644043, "rewards/margins": 2.406959295272827, "rewards/rejected": -6.298697471618652, "step": 14280 }, { "epoch": 0.93, "grad_norm": 29.5, "learning_rate": 6.403060018074003e-08, "logits/chosen": -1.7598024606704712, "logits/rejected": -0.8470717668533325, "logps/chosen": -670.87353515625, "logps/rejected": -777.6146240234375, "loss": 0.3615, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.7819228172302246, "rewards/margins": 1.9163618087768555, "rewards/rejected": -5.698285102844238, "step": 14290 }, { "epoch": 0.94, "grad_norm": 8.125, "learning_rate": 6.275284984645413e-08, "logits/chosen": -1.107751488685608, "logits/rejected": -1.3629257678985596, "logps/chosen": -654.9456787109375, "logps/rejected": -897.4703979492188, "loss": 0.4043, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.150259017944336, "rewards/margins": 2.2077908515930176, "rewards/rejected": -6.358048915863037, "step": 14300 }, { "epoch": 0.94, "eval_logits/chosen": -1.4988292455673218, "eval_logits/rejected": -1.1337298154830933, "eval_logps/chosen": -672.6912231445312, "eval_logps/rejected": -832.4966430664062, "eval_loss": 0.5155640244483948, "eval_rewards/accuracies": 0.7505000233650208, "eval_rewards/chosen": -4.080717086791992, "eval_rewards/margins": 1.7985320091247559, "eval_rewards/rejected": -5.879249095916748, "eval_runtime": 1082.6662, "eval_samples_per_second": 1.847, "eval_steps_per_second": 1.847, "step": 14300 }, { "epoch": 0.94, "grad_norm": 14.0625, "learning_rate": 6.148781523602154e-08, "logits/chosen": -1.271388292312622, "logits/rejected": -1.3283668756484985, "logps/chosen": -617.5052490234375, "logps/rejected": -832.3721923828125, "loss": 0.4665, "rewards/accuracies": 0.75, "rewards/chosen": -3.685497999191284, "rewards/margins": 2.113157272338867, "rewards/rejected": -5.7986555099487305, "step": 14310 }, { "epoch": 0.94, "grad_norm": 49.0, "learning_rate": 6.023550294944253e-08, "logits/chosen": -1.775909662246704, "logits/rejected": -1.0845134258270264, "logps/chosen": -626.0457763671875, "logps/rejected": -818.2127685546875, "loss": 0.3891, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.7062392234802246, "rewards/margins": 2.3490359783172607, "rewards/rejected": -6.055275917053223, "step": 14320 }, { "epoch": 0.94, "grad_norm": 68.0, "learning_rate": 5.899591952034156e-08, "logits/chosen": -1.2532342672348022, "logits/rejected": -0.9101572036743164, "logps/chosen": -649.4820556640625, "logps/rejected": -816.1129760742188, "loss": 0.6037, "rewards/accuracies": 0.625, "rewards/chosen": -4.348361968994141, "rewards/margins": 1.443920373916626, "rewards/rejected": -5.792281627655029, "step": 14330 }, { "epoch": 0.94, "grad_norm": 46.75, "learning_rate": 5.776907141593235e-08, "logits/chosen": -1.6835136413574219, "logits/rejected": -1.2321016788482666, "logps/chosen": -726.80517578125, "logps/rejected": -838.1984252929688, "loss": 0.6008, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.308053016662598, "rewards/margins": 1.6610419750213623, "rewards/rejected": -5.969095706939697, "step": 14340 }, { "epoch": 0.94, "grad_norm": 12.75, "learning_rate": 5.655496503698732e-08, "logits/chosen": -1.7603486776351929, "logits/rejected": -1.3747303485870361, "logps/chosen": -720.5882568359375, "logps/rejected": -837.4011840820312, "loss": 0.7365, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.227126121520996, "rewards/margins": 1.835129737854004, "rewards/rejected": -6.062255382537842, "step": 14350 }, { "epoch": 0.94, "grad_norm": 39.0, "learning_rate": 5.535360671780038e-08, "logits/chosen": -1.6483482122421265, "logits/rejected": -1.1858994960784912, "logps/chosen": -590.8519287109375, "logps/rejected": -728.4163818359375, "loss": 0.4004, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.592966079711914, "rewards/margins": 1.9303786754608154, "rewards/rejected": -5.523344993591309, "step": 14360 }, { "epoch": 0.94, "grad_norm": 70.5, "learning_rate": 5.416500272615671e-08, "logits/chosen": -1.7025705575942993, "logits/rejected": -1.0617040395736694, "logps/chosen": -753.3171997070312, "logps/rejected": -806.6702270507812, "loss": 0.5642, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.199450492858887, "rewards/margins": 1.2109543085098267, "rewards/rejected": -5.410405158996582, "step": 14370 }, { "epoch": 0.94, "grad_norm": 24.0, "learning_rate": 5.2989159263298895e-08, "logits/chosen": -1.6614692211151123, "logits/rejected": -1.1861495971679688, "logps/chosen": -733.9168701171875, "logps/rejected": -875.22607421875, "loss": 0.3975, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.019890785217285, "rewards/margins": 2.211714267730713, "rewards/rejected": -6.231605529785156, "step": 14380 }, { "epoch": 0.94, "grad_norm": 53.25, "learning_rate": 5.182608246389331e-08, "logits/chosen": -1.253227949142456, "logits/rejected": -1.0729159116744995, "logps/chosen": -692.9468383789062, "logps/rejected": -808.3602294921875, "loss": 0.5685, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.932404518127441, "rewards/margins": 1.26759934425354, "rewards/rejected": -6.200003623962402, "step": 14390 }, { "epoch": 0.94, "grad_norm": 21.625, "learning_rate": 5.067577839600157e-08, "logits/chosen": -1.4120208024978638, "logits/rejected": -0.919410228729248, "logps/chosen": -699.5789184570312, "logps/rejected": -867.6041870117188, "loss": 0.5662, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.307889461517334, "rewards/margins": 1.756862998008728, "rewards/rejected": -6.064752578735352, "step": 14400 }, { "epoch": 0.94, "eval_logits/chosen": -1.498733639717102, "eval_logits/rejected": -1.1335071325302124, "eval_logps/chosen": -672.7546997070312, "eval_logps/rejected": -832.6163940429688, "eval_loss": 0.5155320763587952, "eval_rewards/accuracies": 0.7490000128746033, "eval_rewards/chosen": -4.081350803375244, "eval_rewards/margins": 1.7990965843200684, "eval_rewards/rejected": -5.8804473876953125, "eval_runtime": 1082.6971, "eval_samples_per_second": 1.847, "eval_steps_per_second": 1.847, "step": 14400 }, { "epoch": 0.94, "grad_norm": 5.53125, "learning_rate": 4.953825306104526e-08, "logits/chosen": -1.2958862781524658, "logits/rejected": -1.2883752584457397, "logps/chosen": -609.2413330078125, "logps/rejected": -808.3864135742188, "loss": 0.6127, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.84196400642395, "rewards/margins": 2.0697789192199707, "rewards/rejected": -5.911743640899658, "step": 14410 }, { "epoch": 0.94, "grad_norm": 6.84375, "learning_rate": 4.8413512393777063e-08, "logits/chosen": -1.540611982345581, "logits/rejected": -0.7908517122268677, "logps/chosen": -770.2626953125, "logps/rejected": -905.3564453125, "loss": 0.5575, "rewards/accuracies": 0.75, "rewards/chosen": -4.568531036376953, "rewards/margins": 1.737302541732788, "rewards/rejected": -6.305833339691162, "step": 14420 }, { "epoch": 0.94, "grad_norm": 33.75, "learning_rate": 4.730156226224775e-08, "logits/chosen": -1.6443665027618408, "logits/rejected": -0.6632070541381836, "logps/chosen": -703.1766357421875, "logps/rejected": -773.6217041015625, "loss": 0.5922, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.148341178894043, "rewards/margins": 1.496410846710205, "rewards/rejected": -5.644752025604248, "step": 14430 }, { "epoch": 0.94, "grad_norm": 7.875, "learning_rate": 4.6202408467778124e-08, "logits/chosen": -1.670243501663208, "logits/rejected": -0.9057196378707886, "logps/chosen": -755.6795654296875, "logps/rejected": -948.3333129882812, "loss": 0.5145, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.192408084869385, "rewards/margins": 2.144827365875244, "rewards/rejected": -6.337235450744629, "step": 14440 }, { "epoch": 0.95, "grad_norm": 7.3125, "learning_rate": 4.511605674492603e-08, "logits/chosen": -1.5355280637741089, "logits/rejected": -0.9372941255569458, "logps/chosen": -678.46044921875, "logps/rejected": -844.0584716796875, "loss": 0.3516, "rewards/accuracies": 0.875, "rewards/chosen": -3.7402520179748535, "rewards/margins": 2.3936707973480225, "rewards/rejected": -6.133923530578613, "step": 14450 }, { "epoch": 0.95, "grad_norm": 49.0, "learning_rate": 4.404251276145854e-08, "logits/chosen": -1.3825857639312744, "logits/rejected": -1.1840742826461792, "logps/chosen": -669.0966186523438, "logps/rejected": -758.2493286132812, "loss": 0.6961, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.1633710861206055, "rewards/margins": 1.1741771697998047, "rewards/rejected": -5.33754825592041, "step": 14460 }, { "epoch": 0.95, "grad_norm": 10.5, "learning_rate": 4.2981782118321494e-08, "logits/chosen": -1.3598134517669678, "logits/rejected": -1.3417158126831055, "logps/chosen": -610.3348999023438, "logps/rejected": -795.5431518554688, "loss": 0.5826, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.103622913360596, "rewards/margins": 1.4901635646820068, "rewards/rejected": -5.593785762786865, "step": 14470 }, { "epoch": 0.95, "grad_norm": 30.5, "learning_rate": 4.193387034960999e-08, "logits/chosen": -1.499273419380188, "logits/rejected": -1.0753123760223389, "logps/chosen": -704.5675048828125, "logps/rejected": -804.1622924804688, "loss": 0.4204, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.99579119682312, "rewards/margins": 1.5578815937042236, "rewards/rejected": -5.5536723136901855, "step": 14480 }, { "epoch": 0.95, "grad_norm": 30.75, "learning_rate": 4.0898782922539904e-08, "logits/chosen": -1.466566801071167, "logits/rejected": -1.6432005167007446, "logps/chosen": -817.68359375, "logps/rejected": -1022.2804565429688, "loss": 0.7294, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.658608436584473, "rewards/margins": 1.8191381692886353, "rewards/rejected": -6.477746486663818, "step": 14490 }, { "epoch": 0.95, "grad_norm": 9.5, "learning_rate": 3.987652523741975e-08, "logits/chosen": -1.4444339275360107, "logits/rejected": -1.1945425271987915, "logps/chosen": -633.4588623046875, "logps/rejected": -773.0755004882812, "loss": 0.4828, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.7643730640411377, "rewards/margins": 1.7413371801376343, "rewards/rejected": -5.505709648132324, "step": 14500 }, { "epoch": 0.95, "eval_logits/chosen": -1.4990016222000122, "eval_logits/rejected": -1.133996844291687, "eval_logps/chosen": -672.7200927734375, "eval_logps/rejected": -832.5296630859375, "eval_loss": 0.5156899690628052, "eval_rewards/accuracies": 0.7490000128746033, "eval_rewards/chosen": -4.081004619598389, "eval_rewards/margins": 1.7985761165618896, "eval_rewards/rejected": -5.879580020904541, "eval_runtime": 1086.5996, "eval_samples_per_second": 1.841, "eval_steps_per_second": 1.841, "step": 14500 }, { "epoch": 0.95, "grad_norm": 22.75, "learning_rate": 3.8867102627621886e-08, "logits/chosen": -1.490412712097168, "logits/rejected": -1.196725845336914, "logps/chosen": -696.0298461914062, "logps/rejected": -905.2521362304688, "loss": 0.353, "rewards/accuracies": 0.875, "rewards/chosen": -4.203080654144287, "rewards/margins": 2.196010112762451, "rewards/rejected": -6.399090766906738, "step": 14510 }, { "epoch": 0.95, "grad_norm": 16.125, "learning_rate": 3.7870520359554185e-08, "logits/chosen": -0.8974086046218872, "logits/rejected": -1.324383020401001, "logps/chosen": -688.3869018554688, "logps/rejected": -880.1290283203125, "loss": 0.6154, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.464432716369629, "rewards/margins": 1.3156816959381104, "rewards/rejected": -5.78011417388916, "step": 14520 }, { "epoch": 0.95, "grad_norm": 16.625, "learning_rate": 3.688678363263476e-08, "logits/chosen": -1.51911199092865, "logits/rejected": -1.223510980606079, "logps/chosen": -699.217041015625, "logps/rejected": -888.0908203125, "loss": 0.401, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.9736831188201904, "rewards/margins": 2.2153079509735107, "rewards/rejected": -6.188991069793701, "step": 14530 }, { "epoch": 0.95, "grad_norm": 33.25, "learning_rate": 3.5915897579262e-08, "logits/chosen": -1.5013211965560913, "logits/rejected": -1.3283087015151978, "logps/chosen": -625.1622314453125, "logps/rejected": -806.5089721679688, "loss": 0.526, "rewards/accuracies": 0.75, "rewards/chosen": -3.7293975353240967, "rewards/margins": 1.6434872150421143, "rewards/rejected": -5.372884273529053, "step": 14540 }, { "epoch": 0.95, "grad_norm": 86.5, "learning_rate": 3.495786726478934e-08, "logits/chosen": -1.5146243572235107, "logits/rejected": -0.9294026494026184, "logps/chosen": -740.7523193359375, "logps/rejected": -847.3167114257812, "loss": 0.7087, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.267483711242676, "rewards/margins": 1.6665290594100952, "rewards/rejected": -5.934011936187744, "step": 14550 }, { "epoch": 0.95, "grad_norm": 20.125, "learning_rate": 3.401269768749965e-08, "logits/chosen": -1.2246670722961426, "logits/rejected": -1.3304896354675293, "logps/chosen": -689.7512817382812, "logps/rejected": -838.3968505859375, "loss": 0.7249, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.550902843475342, "rewards/margins": 1.3026988506317139, "rewards/rejected": -5.853601932525635, "step": 14560 }, { "epoch": 0.95, "grad_norm": 21.5, "learning_rate": 3.3080393778577305e-08, "logits/chosen": -1.5191315412521362, "logits/rejected": -0.6012996435165405, "logps/chosen": -691.9611206054688, "logps/rejected": -845.7371215820312, "loss": 0.5957, "rewards/accuracies": 0.75, "rewards/chosen": -3.995393753051758, "rewards/margins": 2.2696304321289062, "rewards/rejected": -6.265023708343506, "step": 14570 }, { "epoch": 0.95, "grad_norm": 41.0, "learning_rate": 3.2160960402083105e-08, "logits/chosen": -1.5548509359359741, "logits/rejected": -0.7060686349868774, "logps/chosen": -775.2296752929688, "logps/rejected": -1044.853271484375, "loss": 0.2926, "rewards/accuracies": 0.875, "rewards/chosen": -4.452037811279297, "rewards/margins": 2.910086154937744, "rewards/rejected": -7.362123966217041, "step": 14580 }, { "epoch": 0.95, "grad_norm": 25.375, "learning_rate": 3.125440235492938e-08, "logits/chosen": -1.4717189073562622, "logits/rejected": -0.9017935991287231, "logps/chosen": -744.6813354492188, "logps/rejected": -921.09619140625, "loss": 0.4604, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.509668350219727, "rewards/margins": 1.9908921718597412, "rewards/rejected": -6.500560760498047, "step": 14590 }, { "epoch": 0.96, "grad_norm": 21.625, "learning_rate": 3.036072436685494e-08, "logits/chosen": -0.8438253402709961, "logits/rejected": -0.7805960774421692, "logps/chosen": -598.6583251953125, "logps/rejected": -848.5973510742188, "loss": 0.5555, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.2352776527404785, "rewards/margins": 2.0711169242858887, "rewards/rejected": -6.306394577026367, "step": 14600 }, { "epoch": 0.96, "eval_logits/chosen": -1.4989516735076904, "eval_logits/rejected": -1.1335021257400513, "eval_logps/chosen": -672.670654296875, "eval_logps/rejected": -832.4429931640625, "eval_loss": 0.5157020092010498, "eval_rewards/accuracies": 0.7490000128746033, "eval_rewards/chosen": -4.080509662628174, "eval_rewards/margins": 1.7982033491134644, "eval_rewards/rejected": -5.878713607788086, "eval_runtime": 1082.7722, "eval_samples_per_second": 1.847, "eval_steps_per_second": 1.847, "step": 14600 }, { "epoch": 0.96, "grad_norm": 15.25, "learning_rate": 2.9479931100399594e-08, "logits/chosen": -1.3217475414276123, "logits/rejected": -1.43135666847229, "logps/chosen": -689.2044067382812, "logps/rejected": -887.3733520507812, "loss": 0.5641, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.110635280609131, "rewards/margins": 2.011808156967163, "rewards/rejected": -6.122443199157715, "step": 14610 }, { "epoch": 0.96, "grad_norm": 22.0, "learning_rate": 2.8612027150880517e-08, "logits/chosen": -1.9152240753173828, "logits/rejected": -1.1616852283477783, "logps/chosen": -674.3377685546875, "logps/rejected": -855.0958251953125, "loss": 0.4445, "rewards/accuracies": 0.75, "rewards/chosen": -3.7798867225646973, "rewards/margins": 2.4780044555664062, "rewards/rejected": -6.2578911781311035, "step": 14620 }, { "epoch": 0.96, "grad_norm": 17.5, "learning_rate": 2.7757017046368685e-08, "logits/chosen": -1.4314361810684204, "logits/rejected": -1.1097551584243774, "logps/chosen": -725.2561645507812, "logps/rejected": -833.2628173828125, "loss": 0.5833, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.297665596008301, "rewards/margins": 1.4585754871368408, "rewards/rejected": -5.756241798400879, "step": 14630 }, { "epoch": 0.96, "grad_norm": 3.484375, "learning_rate": 2.691490524766388e-08, "logits/chosen": -1.7065837383270264, "logits/rejected": -1.2123456001281738, "logps/chosen": -639.0941162109375, "logps/rejected": -783.9685668945312, "loss": 0.5274, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -3.5798423290252686, "rewards/margins": 1.883596658706665, "rewards/rejected": -5.463438987731934, "step": 14640 }, { "epoch": 0.96, "grad_norm": 12.75, "learning_rate": 2.6085696148272775e-08, "logits/chosen": -1.3226919174194336, "logits/rejected": -0.9627928733825684, "logps/chosen": -660.2105102539062, "logps/rejected": -845.9801635742188, "loss": 0.5592, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.023497104644775, "rewards/margins": 1.8011744022369385, "rewards/rejected": -5.824671745300293, "step": 14650 }, { "epoch": 0.96, "grad_norm": 23.0, "learning_rate": 2.526939407438478e-08, "logits/chosen": -1.4593091011047363, "logits/rejected": -1.4013919830322266, "logps/chosen": -617.6576538085938, "logps/rejected": -816.8186645507812, "loss": 0.4457, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.800004243850708, "rewards/margins": 2.120269775390625, "rewards/rejected": -5.92027473449707, "step": 14660 }, { "epoch": 0.96, "grad_norm": 61.0, "learning_rate": 2.4466003284851214e-08, "logits/chosen": -1.5862547159194946, "logits/rejected": -1.2162004709243774, "logps/chosen": -751.1326904296875, "logps/rejected": -802.6964111328125, "loss": 0.9363, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.71038293838501, "rewards/margins": 0.7390093803405762, "rewards/rejected": -5.449391841888428, "step": 14670 }, { "epoch": 0.96, "grad_norm": 11.9375, "learning_rate": 2.3675527971160905e-08, "logits/chosen": -1.736262559890747, "logits/rejected": -1.372564435005188, "logps/chosen": -719.0181274414062, "logps/rejected": -858.1018676757812, "loss": 0.726, "rewards/accuracies": 0.625, "rewards/chosen": -3.9259440898895264, "rewards/margins": 1.499529480934143, "rewards/rejected": -5.425474166870117, "step": 14680 }, { "epoch": 0.96, "grad_norm": 43.0, "learning_rate": 2.2897972257420465e-08, "logits/chosen": -1.585780143737793, "logits/rejected": -0.9886649250984192, "logps/chosen": -656.6448974609375, "logps/rejected": -802.1618041992188, "loss": 0.5761, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.8232007026672363, "rewards/margins": 1.6648178100585938, "rewards/rejected": -5.488018035888672, "step": 14690 }, { "epoch": 0.96, "grad_norm": 65.5, "learning_rate": 2.21333402003307e-08, "logits/chosen": -1.5568093061447144, "logits/rejected": -1.4344760179519653, "logps/chosen": -685.9212646484375, "logps/rejected": -763.9901123046875, "loss": 0.704, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.141175270080566, "rewards/margins": 1.0672465562820435, "rewards/rejected": -5.208422660827637, "step": 14700 }, { "epoch": 0.96, "eval_logits/chosen": -1.4989476203918457, "eval_logits/rejected": -1.1337580680847168, "eval_logps/chosen": -672.6378173828125, "eval_logps/rejected": -832.4694213867188, "eval_loss": 0.5155410766601562, "eval_rewards/accuracies": 0.7505000233650208, "eval_rewards/chosen": -4.0801825523376465, "eval_rewards/margins": 1.798795223236084, "eval_rewards/rejected": -5.878978729248047, "eval_runtime": 1082.7497, "eval_samples_per_second": 1.847, "eval_steps_per_second": 1.847, "step": 14700 }, { "epoch": 0.96, "grad_norm": 33.0, "learning_rate": 2.138163578916719e-08, "logits/chosen": -1.822434425354004, "logits/rejected": -1.4496662616729736, "logps/chosen": -745.0465087890625, "logps/rejected": -825.1981201171875, "loss": 0.3544, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.8859755992889404, "rewards/margins": 1.5678056478500366, "rewards/rejected": -5.453780651092529, "step": 14710 }, { "epoch": 0.96, "grad_norm": 72.5, "learning_rate": 2.0642862945758625e-08, "logits/chosen": -1.3334019184112549, "logits/rejected": -0.5126091837882996, "logps/chosen": -634.9332275390625, "logps/rejected": -792.2523803710938, "loss": 0.362, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.824808120727539, "rewards/margins": 2.1914801597595215, "rewards/rejected": -6.016289234161377, "step": 14720 }, { "epoch": 0.96, "grad_norm": 12.0, "learning_rate": 1.991702552446656e-08, "logits/chosen": -1.267516851425171, "logits/rejected": -1.132657766342163, "logps/chosen": -605.05224609375, "logps/rejected": -800.3802490234375, "loss": 0.6295, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.9204049110412598, "rewards/margins": 2.0827763080596924, "rewards/rejected": -6.003181457519531, "step": 14730 }, { "epoch": 0.96, "grad_norm": 32.0, "learning_rate": 1.9204127312164866e-08, "logits/chosen": -1.1736063957214355, "logits/rejected": -0.9303300976753235, "logps/chosen": -640.8338012695312, "logps/rejected": -848.6588134765625, "loss": 0.4759, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.109316825866699, "rewards/margins": 2.118309259414673, "rewards/rejected": -6.227626323699951, "step": 14740 }, { "epoch": 0.97, "grad_norm": 29.625, "learning_rate": 1.8504172028220858e-08, "logits/chosen": -1.183397889137268, "logits/rejected": -1.1711453199386597, "logps/chosen": -695.2463989257812, "logps/rejected": -784.6041870117188, "loss": 0.8545, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.528311729431152, "rewards/margins": 1.0655442476272583, "rewards/rejected": -5.593855857849121, "step": 14750 }, { "epoch": 0.97, "grad_norm": 7.84375, "learning_rate": 1.7817163324475306e-08, "logits/chosen": -1.3552110195159912, "logits/rejected": -0.7475605010986328, "logps/chosen": -642.6542358398438, "logps/rejected": -863.9797973632812, "loss": 0.395, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.716207981109619, "rewards/margins": 2.7749006748199463, "rewards/rejected": -6.491107940673828, "step": 14760 }, { "epoch": 0.97, "grad_norm": 18.625, "learning_rate": 1.7143104785222465e-08, "logits/chosen": -1.7113873958587646, "logits/rejected": -1.2836427688598633, "logps/chosen": -745.9172973632812, "logps/rejected": -881.0494384765625, "loss": 0.3475, "rewards/accuracies": 0.875, "rewards/chosen": -4.036852836608887, "rewards/margins": 1.9993245601654053, "rewards/rejected": -6.036176681518555, "step": 14770 }, { "epoch": 0.97, "grad_norm": 74.5, "learning_rate": 1.6481999927194238e-08, "logits/chosen": -1.6346254348754883, "logits/rejected": -0.41228994727134705, "logps/chosen": -690.8013916015625, "logps/rejected": -706.4359130859375, "loss": 0.7439, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.003089904785156, "rewards/margins": 1.2065765857696533, "rewards/rejected": -5.209666728973389, "step": 14780 }, { "epoch": 0.97, "grad_norm": 28.375, "learning_rate": 1.583385219953826e-08, "logits/chosen": -1.4589871168136597, "logits/rejected": -0.9129419326782227, "logps/chosen": -686.2401123046875, "logps/rejected": -888.7102661132812, "loss": 0.3906, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.708935260772705, "rewards/margins": 2.2104568481445312, "rewards/rejected": -5.919392108917236, "step": 14790 }, { "epoch": 0.97, "grad_norm": 87.5, "learning_rate": 1.5198664983802346e-08, "logits/chosen": -1.5424673557281494, "logits/rejected": -0.5884643793106079, "logps/chosen": -742.8410034179688, "logps/rejected": -774.7767333984375, "loss": 0.7164, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -4.436744213104248, "rewards/margins": 1.2054417133331299, "rewards/rejected": -5.642185211181641, "step": 14800 }, { "epoch": 0.97, "eval_logits/chosen": -1.499094843864441, "eval_logits/rejected": -1.1339683532714844, "eval_logps/chosen": -672.6747436523438, "eval_logps/rejected": -832.5261840820312, "eval_loss": 0.5157620906829834, "eval_rewards/accuracies": 0.7490000128746033, "eval_rewards/chosen": -4.0805511474609375, "eval_rewards/margins": 1.7989939451217651, "eval_rewards/rejected": -5.879545211791992, "eval_runtime": 1082.6464, "eval_samples_per_second": 1.847, "eval_steps_per_second": 1.847, "step": 14800 }, { "epoch": 0.97, "grad_norm": 1.7421875, "learning_rate": 1.4576441593915914e-08, "logits/chosen": -1.320824384689331, "logits/rejected": -0.8941282033920288, "logps/chosen": -552.009765625, "logps/rejected": -806.8757934570312, "loss": 0.3929, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.634228229522705, "rewards/margins": 2.4304652214050293, "rewards/rejected": -6.064693927764893, "step": 14810 }, { "epoch": 0.97, "grad_norm": 60.5, "learning_rate": 1.39671852761733e-08, "logits/chosen": -1.4982420206069946, "logits/rejected": -0.9083837270736694, "logps/chosen": -779.04443359375, "logps/rejected": -793.7080078125, "loss": 0.6733, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.489626884460449, "rewards/margins": 1.4485924243927002, "rewards/rejected": -5.9382195472717285, "step": 14820 }, { "epoch": 0.97, "grad_norm": 25.75, "learning_rate": 1.3370899209216027e-08, "logits/chosen": -1.1626901626586914, "logits/rejected": -1.005846619606018, "logps/chosen": -641.0983276367188, "logps/rejected": -829.1727294921875, "loss": 0.568, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.073776721954346, "rewards/margins": 1.6865482330322266, "rewards/rejected": -5.760324954986572, "step": 14830 }, { "epoch": 0.97, "grad_norm": 32.25, "learning_rate": 1.2787586504016125e-08, "logits/chosen": -1.5545995235443115, "logits/rejected": -1.2467468976974487, "logps/chosen": -609.7913818359375, "logps/rejected": -871.1988525390625, "loss": 0.3519, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.7171998023986816, "rewards/margins": 2.801652193069458, "rewards/rejected": -6.518851280212402, "step": 14840 }, { "epoch": 0.97, "grad_norm": 35.0, "learning_rate": 1.2217250203861441e-08, "logits/chosen": -1.6491849422454834, "logits/rejected": -1.1245169639587402, "logps/chosen": -715.8529052734375, "logps/rejected": -872.3297729492188, "loss": 0.4356, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.860668659210205, "rewards/margins": 1.6028010845184326, "rewards/rejected": -6.463469505310059, "step": 14850 }, { "epoch": 0.97, "grad_norm": 8.4375, "learning_rate": 1.1659893284338141e-08, "logits/chosen": -1.6428101062774658, "logits/rejected": -1.1711459159851074, "logps/chosen": -642.7400512695312, "logps/rejected": -912.45361328125, "loss": 0.3053, "rewards/accuracies": 0.875, "rewards/chosen": -3.5390942096710205, "rewards/margins": 2.695460796356201, "rewards/rejected": -6.234554767608643, "step": 14860 }, { "epoch": 0.97, "grad_norm": 30.25, "learning_rate": 1.111551865331517e-08, "logits/chosen": -1.5772109031677246, "logits/rejected": -0.9146392941474915, "logps/chosen": -699.5916137695312, "logps/rejected": -868.1456909179688, "loss": 0.4894, "rewards/accuracies": 0.75, "rewards/chosen": -3.8581337928771973, "rewards/margins": 2.0164055824279785, "rewards/rejected": -5.874539375305176, "step": 14870 }, { "epoch": 0.97, "grad_norm": 42.5, "learning_rate": 1.0584129150930656e-08, "logits/chosen": -1.5206120014190674, "logits/rejected": -1.4517956972122192, "logps/chosen": -635.8982543945312, "logps/rejected": -847.0305786132812, "loss": 0.4494, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.7315566539764404, "rewards/margins": 1.9645030498504639, "rewards/rejected": -5.696059703826904, "step": 14880 }, { "epoch": 0.97, "grad_norm": 10.0, "learning_rate": 1.0065727549575521e-08, "logits/chosen": -0.8860852122306824, "logits/rejected": -1.0874881744384766, "logps/chosen": -547.4925537109375, "logps/rejected": -725.10888671875, "loss": 0.5471, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.8739025592803955, "rewards/margins": 1.367903232574463, "rewards/rejected": -5.2418060302734375, "step": 14890 }, { "epoch": 0.97, "grad_norm": 21.0, "learning_rate": 9.5603165538799e-09, "logits/chosen": -1.2977436780929565, "logits/rejected": -0.4156854748725891, "logps/chosen": -663.11474609375, "logps/rejected": -824.2686767578125, "loss": 0.3263, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.8760039806365967, "rewards/margins": 2.249363422393799, "rewards/rejected": -6.125367164611816, "step": 14900 }, { "epoch": 0.97, "eval_logits/chosen": -1.4993937015533447, "eval_logits/rejected": -1.1338773965835571, "eval_logps/chosen": -672.5684814453125, "eval_logps/rejected": -832.3969116210938, "eval_loss": 0.5154815912246704, "eval_rewards/accuracies": 0.7509999871253967, "eval_rewards/chosen": -4.079488754272461, "eval_rewards/margins": 1.798763394355774, "eval_rewards/rejected": -5.878252029418945, "eval_runtime": 1082.6956, "eval_samples_per_second": 1.847, "eval_steps_per_second": 1.847, "step": 14900 }, { "epoch": 0.98, "grad_norm": 24.625, "learning_rate": 9.067898800698416e-09, "logits/chosen": -1.900448203086853, "logits/rejected": -1.5315290689468384, "logps/chosen": -645.0261840820312, "logps/rejected": -793.4893188476562, "loss": 0.3409, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.7169525623321533, "rewards/margins": 1.7609758377075195, "rewards/rejected": -5.477928638458252, "step": 14910 }, { "epoch": 0.98, "grad_norm": 19.625, "learning_rate": 8.588476859096584e-09, "logits/chosen": -1.6443464756011963, "logits/rejected": -0.9735485911369324, "logps/chosen": -661.150390625, "logps/rejected": -828.80224609375, "loss": 0.571, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.893770694732666, "rewards/margins": 1.9429336786270142, "rewards/rejected": -5.836704254150391, "step": 14920 }, { "epoch": 0.98, "grad_norm": 32.75, "learning_rate": 8.122053230338045e-09, "logits/chosen": -1.5683542490005493, "logits/rejected": -1.5405677556991577, "logps/chosen": -661.5060424804688, "logps/rejected": -833.9655151367188, "loss": 0.5751, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.9671859741210938, "rewards/margins": 1.3349368572235107, "rewards/rejected": -5.302123069763184, "step": 14930 }, { "epoch": 0.98, "grad_norm": 29.375, "learning_rate": 7.668630347870688e-09, "logits/chosen": -1.3812329769134521, "logits/rejected": -1.0269170999526978, "logps/chosen": -584.7435302734375, "logps/rejected": -754.8016357421875, "loss": 0.7606, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.0077104568481445, "rewards/margins": 1.5782127380371094, "rewards/rejected": -5.585923194885254, "step": 14940 }, { "epoch": 0.98, "grad_norm": 47.5, "learning_rate": 7.2282105773144315e-09, "logits/chosen": -1.3857173919677734, "logits/rejected": -1.0819319486618042, "logps/chosen": -653.53076171875, "logps/rejected": -813.7086181640625, "loss": 0.5398, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.9572224617004395, "rewards/margins": 1.6748088598251343, "rewards/rejected": -5.632031440734863, "step": 14950 }, { "epoch": 0.98, "grad_norm": 45.5, "learning_rate": 6.800796216448191e-09, "logits/chosen": -1.760093331336975, "logits/rejected": -1.6171455383300781, "logps/chosen": -710.0787353515625, "logps/rejected": -791.6492309570312, "loss": 0.6757, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -3.9908242225646973, "rewards/margins": 1.3836671113967896, "rewards/rejected": -5.3744916915893555, "step": 14960 }, { "epoch": 0.98, "grad_norm": 11.75, "learning_rate": 6.3863894951993185e-09, "logits/chosen": -1.515354037284851, "logits/rejected": -0.8208287358283997, "logps/chosen": -677.3696899414062, "logps/rejected": -785.3562622070312, "loss": 0.4713, "rewards/accuracies": 0.75, "rewards/chosen": -3.821608781814575, "rewards/margins": 1.901656150817871, "rewards/rejected": -5.723265171051025, "step": 14970 }, { "epoch": 0.98, "grad_norm": 39.5, "learning_rate": 5.9849925756302885e-09, "logits/chosen": -1.6482946872711182, "logits/rejected": -1.3099112510681152, "logps/chosen": -627.640869140625, "logps/rejected": -740.9942016601562, "loss": 0.6365, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -3.6764042377471924, "rewards/margins": 1.3432334661483765, "rewards/rejected": -5.0196380615234375, "step": 14980 }, { "epoch": 0.98, "grad_norm": 5.46875, "learning_rate": 5.596607551928979e-09, "logits/chosen": -1.2369056940078735, "logits/rejected": -1.0761168003082275, "logps/chosen": -640.3348999023438, "logps/rejected": -826.6220703125, "loss": 0.4183, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.8065619468688965, "rewards/margins": 2.105325222015381, "rewards/rejected": -5.911887168884277, "step": 14990 }, { "epoch": 0.98, "grad_norm": 2.03125, "learning_rate": 5.221236450396183e-09, "logits/chosen": -1.5033068656921387, "logits/rejected": -1.3899309635162354, "logps/chosen": -640.50537109375, "logps/rejected": -841.1510620117188, "loss": 0.3809, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.548269748687744, "rewards/margins": 2.5847649574279785, "rewards/rejected": -6.133034706115723, "step": 15000 }, { "epoch": 0.98, "eval_logits/chosen": -1.499165654182434, "eval_logits/rejected": -1.1336843967437744, "eval_logps/chosen": -672.6626586914062, "eval_logps/rejected": -832.5026245117188, "eval_loss": 0.515548586845398, "eval_rewards/accuracies": 0.7490000128746033, "eval_rewards/chosen": -4.08043098449707, "eval_rewards/margins": 1.7988783121109009, "eval_rewards/rejected": -5.87930965423584, "eval_runtime": 1082.6689, "eval_samples_per_second": 1.847, "eval_steps_per_second": 1.847, "step": 15000 }, { "epoch": 0.98, "grad_norm": 43.25, "learning_rate": 4.858881229436174e-09, "logits/chosen": -2.0318174362182617, "logits/rejected": -0.9047209620475769, "logps/chosen": -776.1448364257812, "logps/rejected": -877.9874267578125, "loss": 0.3758, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.069352149963379, "rewards/margins": 1.9717031717300415, "rewards/rejected": -6.041055202484131, "step": 15010 }, { "epoch": 0.98, "grad_norm": 11.125, "learning_rate": 4.509543779546433e-09, "logits/chosen": -1.3191606998443604, "logits/rejected": -1.4304561614990234, "logps/chosen": -637.6331176757812, "logps/rejected": -841.3294677734375, "loss": 0.4604, "rewards/accuracies": 0.75, "rewards/chosen": -3.8065552711486816, "rewards/margins": 2.0067198276519775, "rewards/rejected": -5.813275337219238, "step": 15020 }, { "epoch": 0.98, "grad_norm": 8.0, "learning_rate": 4.1732259233071004e-09, "logits/chosen": -1.4344446659088135, "logits/rejected": -0.8155355453491211, "logps/chosen": -674.0362548828125, "logps/rejected": -768.2298583984375, "loss": 0.5534, "rewards/accuracies": 0.75, "rewards/chosen": -3.920776844024658, "rewards/margins": 1.5957410335540771, "rewards/rejected": -5.5165181159973145, "step": 15030 }, { "epoch": 0.98, "grad_norm": 19.875, "learning_rate": 3.849929415371823e-09, "logits/chosen": -1.7249119281768799, "logits/rejected": -1.158717393875122, "logps/chosen": -654.02685546875, "logps/rejected": -877.78759765625, "loss": 0.3459, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.5320918560028076, "rewards/margins": 2.890892505645752, "rewards/rejected": -6.4229841232299805, "step": 15040 }, { "epoch": 0.98, "grad_norm": 22.5, "learning_rate": 3.5396559424591416e-09, "logits/chosen": -1.4106509685516357, "logits/rejected": -1.337409257888794, "logps/chosen": -695.9830322265625, "logps/rejected": -807.8668212890625, "loss": 0.8018, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.744257926940918, "rewards/margins": 1.1727367639541626, "rewards/rejected": -5.916995048522949, "step": 15050 }, { "epoch": 0.99, "grad_norm": 35.25, "learning_rate": 3.242407123342506e-09, "logits/chosen": -1.4338390827178955, "logits/rejected": -1.2881247997283936, "logps/chosen": -686.8263549804688, "logps/rejected": -857.0071411132812, "loss": 0.4319, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.05562162399292, "rewards/margins": 1.693246841430664, "rewards/rejected": -5.748867988586426, "step": 15060 }, { "epoch": 0.99, "grad_norm": 40.75, "learning_rate": 2.9581845088430537e-09, "logits/chosen": -1.2244328260421753, "logits/rejected": -1.101989984512329, "logps/chosen": -721.6100463867188, "logps/rejected": -963.9714965820312, "loss": 0.4082, "rewards/accuracies": 0.75, "rewards/chosen": -4.208508491516113, "rewards/margins": 2.4751088619232178, "rewards/rejected": -6.68361759185791, "step": 15070 }, { "epoch": 0.99, "grad_norm": 22.625, "learning_rate": 2.686989581820454e-09, "logits/chosen": -1.5228071212768555, "logits/rejected": -1.4482409954071045, "logps/chosen": -590.4705810546875, "logps/rejected": -796.4525146484375, "loss": 0.5071, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.9141297340393066, "rewards/margins": 1.6521908044815063, "rewards/rejected": -5.566319942474365, "step": 15080 }, { "epoch": 0.99, "grad_norm": 2.84375, "learning_rate": 2.4288237571665206e-09, "logits/chosen": -1.571813702583313, "logits/rejected": -1.1080796718597412, "logps/chosen": -604.7337646484375, "logps/rejected": -806.4271240234375, "loss": 0.313, "rewards/accuracies": 0.875, "rewards/chosen": -3.7468886375427246, "rewards/margins": 2.1171717643737793, "rewards/rejected": -5.864060878753662, "step": 15090 }, { "epoch": 0.99, "grad_norm": 10.375, "learning_rate": 2.183688381796056e-09, "logits/chosen": -1.258375883102417, "logits/rejected": -1.2669404745101929, "logps/chosen": -681.5723876953125, "logps/rejected": -902.41162109375, "loss": 0.4781, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.094157695770264, "rewards/margins": 2.1871683597564697, "rewards/rejected": -6.2813262939453125, "step": 15100 }, { "epoch": 0.99, "eval_logits/chosen": -1.499052882194519, "eval_logits/rejected": -1.1336398124694824, "eval_logps/chosen": -672.708251953125, "eval_logps/rejected": -832.45849609375, "eval_loss": 0.5158145427703857, "eval_rewards/accuracies": 0.7494999766349792, "eval_rewards/chosen": -4.0808868408203125, "eval_rewards/margins": 1.797980785369873, "eval_rewards/rejected": -5.8788676261901855, "eval_runtime": 1082.8206, "eval_samples_per_second": 1.847, "eval_steps_per_second": 1.847, "step": 15100 }, { "epoch": 0.99, "grad_norm": 25.125, "learning_rate": 1.9515847346412986e-09, "logits/chosen": -1.5806621313095093, "logits/rejected": -1.119863748550415, "logps/chosen": -629.0213012695312, "logps/rejected": -754.7540283203125, "loss": 0.5291, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.810438871383667, "rewards/margins": 1.3979140520095825, "rewards/rejected": -5.208353519439697, "step": 15110 }, { "epoch": 0.99, "grad_norm": 34.5, "learning_rate": 1.7325140266449825e-09, "logits/chosen": -1.3101478815078735, "logits/rejected": -0.9645036458969116, "logps/chosen": -672.7442016601562, "logps/rejected": -803.9622802734375, "loss": 0.5583, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.15883207321167, "rewards/margins": 1.7983806133270264, "rewards/rejected": -5.957211971282959, "step": 15120 }, { "epoch": 0.99, "grad_norm": 34.5, "learning_rate": 1.526477400752846e-09, "logits/chosen": -1.4830574989318848, "logits/rejected": -0.8379748463630676, "logps/chosen": -697.93310546875, "logps/rejected": -853.8567504882812, "loss": 0.7395, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.242354393005371, "rewards/margins": 1.6751320362091064, "rewards/rejected": -5.917486667633057, "step": 15130 }, { "epoch": 0.99, "grad_norm": 6.125, "learning_rate": 1.3334759319097447e-09, "logits/chosen": -1.2374359369277954, "logits/rejected": -0.9841200113296509, "logps/chosen": -634.9082641601562, "logps/rejected": -882.9581298828125, "loss": 0.2607, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.7476706504821777, "rewards/margins": 2.644453287124634, "rewards/rejected": -6.392125129699707, "step": 15140 }, { "epoch": 0.99, "grad_norm": 22.0, "learning_rate": 1.1535106270518793e-09, "logits/chosen": -1.8277978897094727, "logits/rejected": -0.6091334223747253, "logps/chosen": -696.5634765625, "logps/rejected": -895.7308349609375, "loss": 0.4254, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.882366895675659, "rewards/margins": 2.4781439304351807, "rewards/rejected": -6.36051082611084, "step": 15150 }, { "epoch": 0.99, "grad_norm": 11.375, "learning_rate": 9.865824251031886e-10, "logits/chosen": -1.5557177066802979, "logits/rejected": -1.548560380935669, "logps/chosen": -641.8161010742188, "logps/rejected": -791.4983520507812, "loss": 0.6152, "rewards/accuracies": 0.75, "rewards/chosen": -4.355650901794434, "rewards/margins": 1.5758931636810303, "rewards/rejected": -5.931543350219727, "step": 15160 }, { "epoch": 0.99, "grad_norm": 19.625, "learning_rate": 8.326921969692425e-10, "logits/chosen": -1.7205381393432617, "logits/rejected": -1.1034014225006104, "logps/chosen": -673.2122192382812, "logps/rejected": -783.8216552734375, "loss": 0.6822, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.118476867675781, "rewards/margins": 1.2156922817230225, "rewards/rejected": -5.334169387817383, "step": 15170 }, { "epoch": 0.99, "grad_norm": 32.0, "learning_rate": 6.918407455339116e-10, "logits/chosen": -1.6896483898162842, "logits/rejected": -1.1890302896499634, "logps/chosen": -748.822021484375, "logps/rejected": -869.2823486328125, "loss": 0.6371, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.583366870880127, "rewards/margins": 1.3712291717529297, "rewards/rejected": -5.954596042633057, "step": 15180 }, { "epoch": 0.99, "grad_norm": 68.0, "learning_rate": 5.640288056540932e-10, "logits/chosen": -1.510573387145996, "logits/rejected": -1.44536292552948, "logps/chosen": -631.5179443359375, "logps/rejected": -754.64697265625, "loss": 0.682, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.041640758514404, "rewards/margins": 1.1729449033737183, "rewards/rejected": -5.214585781097412, "step": 15190 }, { "epoch": 0.99, "grad_norm": 11.875, "learning_rate": 4.492570441563815e-10, "logits/chosen": -1.525930404663086, "logits/rejected": -1.0159375667572021, "logps/chosen": -577.1013793945312, "logps/rejected": -772.0386962890625, "loss": 0.5115, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.5004782676696777, "rewards/margins": 1.9235942363739014, "rewards/rejected": -5.424071788787842, "step": 15200 }, { "epoch": 0.99, "eval_logits/chosen": -1.4991191625595093, "eval_logits/rejected": -1.133660078048706, "eval_logps/chosen": -672.6617431640625, "eval_logps/rejected": -832.3694458007812, "eval_loss": 0.5158668160438538, "eval_rewards/accuracies": 0.7475000023841858, "eval_rewards/chosen": -4.080421447753906, "eval_rewards/margins": 1.7975566387176514, "eval_rewards/rejected": -5.8779778480529785, "eval_runtime": 1082.6791, "eval_samples_per_second": 1.847, "eval_steps_per_second": 1.847, "step": 15200 }, { "epoch": 1.0, "grad_norm": 29.625, "learning_rate": 3.47526059833736e-10, "logits/chosen": -1.9035943746566772, "logits/rejected": -1.4374911785125732, "logps/chosen": -777.3638916015625, "logps/rejected": -838.9942626953125, "loss": 0.6063, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.975438356399536, "rewards/margins": 0.955004096031189, "rewards/rejected": -4.9304423332214355, "step": 15210 }, { "epoch": 1.0, "grad_norm": 33.5, "learning_rate": 2.5883638344159636e-10, "logits/chosen": -1.4447400569915771, "logits/rejected": -1.0255532264709473, "logps/chosen": -608.6077880859375, "logps/rejected": -816.8342895507812, "loss": 0.4727, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.4336369037628174, "rewards/margins": 2.4046621322631836, "rewards/rejected": -5.838299751281738, "step": 15220 }, { "epoch": 1.0, "grad_norm": 8.75, "learning_rate": 1.8318847769649428e-10, "logits/chosen": -1.6346423625946045, "logits/rejected": -1.1284949779510498, "logps/chosen": -702.5090942382812, "logps/rejected": -884.5067138671875, "loss": 0.57, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.193155765533447, "rewards/margins": 1.920485258102417, "rewards/rejected": -6.113640785217285, "step": 15230 }, { "epoch": 1.0, "grad_norm": 32.25, "learning_rate": 1.2058273727216797e-10, "logits/chosen": -1.4720618724822998, "logits/rejected": -1.399948000907898, "logps/chosen": -682.2576904296875, "logps/rejected": -790.8546752929688, "loss": 0.6199, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.146370887756348, "rewards/margins": 1.5523508787155151, "rewards/rejected": -5.698721885681152, "step": 15240 }, { "epoch": 1.0, "grad_norm": 53.75, "learning_rate": 7.101948879845167e-11, "logits/chosen": -1.395618200302124, "logits/rejected": -0.9973516464233398, "logps/chosen": -664.3514404296875, "logps/rejected": -823.5393676757812, "loss": 0.5459, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.943124771118164, "rewards/margins": 2.1066854000091553, "rewards/rejected": -6.04980993270874, "step": 15250 }, { "epoch": 1.0, "grad_norm": 10.9375, "learning_rate": 3.4498990858777835e-11, "logits/chosen": -1.4538891315460205, "logits/rejected": -1.005915880203247, "logps/chosen": -605.2978515625, "logps/rejected": -825.0804443359375, "loss": 0.418, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.689631223678589, "rewards/margins": 2.339841365814209, "rewards/rejected": -6.029472827911377, "step": 15260 }, { "epoch": 1.0, "grad_norm": 7.65625, "learning_rate": 1.1021433989899522e-11, "logits/chosen": -1.4069284200668335, "logits/rejected": -0.5471193790435791, "logps/chosen": -684.0292358398438, "logps/rejected": -856.8021240234375, "loss": 0.5406, "rewards/accuracies": 0.75, "rewards/chosen": -4.023730754852295, "rewards/margins": 2.395432233810425, "rewards/rejected": -6.419162750244141, "step": 15270 }, { "epoch": 1.0, "grad_norm": 20.125, "learning_rate": 5.869406799474675e-13, "logits/chosen": -1.5001615285873413, "logits/rejected": -1.351322889328003, "logps/chosen": -733.93115234375, "logps/rejected": -812.1211547851562, "loss": 0.6457, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.38846492767334, "rewards/margins": 1.3247134685516357, "rewards/rejected": -5.713177680969238, "step": 15280 }, { "epoch": 1.0, "step": 15283, "total_flos": 0.0, "train_loss": 0.5464587531257954, "train_runtime": 239691.3187, "train_samples_per_second": 0.255, "train_steps_per_second": 0.064 } ], "logging_steps": 10, "max_steps": 15283, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }