{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.998451213216314, "eval_steps": 100, "global_step": 2904, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.7182130584192438e-09, "logits/chosen": 22.749126434326172, "logits/rejected": 22.455398559570312, "logps/chosen": -415.7331848144531, "logps/rejected": -294.51483154296875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0, "eval_logits/chosen": 23.82334327697754, "eval_logits/rejected": 23.573287963867188, "eval_logps/chosen": -354.5701599121094, "eval_logps/rejected": -274.08343505859375, "eval_loss": 0.6931473612785339, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": 0.0, "eval_rewards/margins": 0.0, "eval_rewards/rejected": 0.0, "eval_runtime": 208.2485, "eval_samples_per_second": 9.604, "eval_steps_per_second": 0.303, "step": 1 }, { "epoch": 0.01, "learning_rate": 1.718213058419244e-08, "logits/chosen": 23.493385314941406, "logits/rejected": 23.479415893554688, "logps/chosen": -359.0509948730469, "logps/rejected": -263.7375793457031, "loss": 0.692, "rewards/accuracies": 0.5833333134651184, "rewards/chosen": 0.016306404024362564, "rewards/margins": 0.025918345898389816, "rewards/rejected": -0.009611942805349827, "step": 10 }, { "epoch": 0.02, "learning_rate": 3.436426116838488e-08, "logits/chosen": 23.505186080932617, "logits/rejected": 23.52346420288086, "logps/chosen": -327.48468017578125, "logps/rejected": -279.432861328125, "loss": 0.6965, "rewards/accuracies": 0.4375, "rewards/chosen": -0.013154825195670128, "rewards/margins": -0.014362807385623455, "rewards/rejected": 0.0012079827720299363, "step": 20 }, { "epoch": 0.03, "learning_rate": 5.154639175257731e-08, "logits/chosen": 23.50873374938965, "logits/rejected": 23.2880859375, "logps/chosen": -340.9912109375, "logps/rejected": -269.15045166015625, "loss": 0.6955, "rewards/accuracies": 0.4375, "rewards/chosen": 0.0022484897635877132, "rewards/margins": -0.017411604523658752, "rewards/rejected": 0.0196601003408432, "step": 30 }, { "epoch": 0.04, "learning_rate": 6.872852233676976e-08, "logits/chosen": 23.961822509765625, "logits/rejected": 23.730144500732422, "logps/chosen": -414.52447509765625, "logps/rejected": -300.4974670410156, "loss": 0.6961, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.003453383222222328, "rewards/margins": 0.017737122252583504, "rewards/rejected": -0.014283737167716026, "step": 40 }, { "epoch": 0.05, "learning_rate": 8.59106529209622e-08, "logits/chosen": 23.999908447265625, "logits/rejected": 23.47333335876465, "logps/chosen": -313.49395751953125, "logps/rejected": -216.2849578857422, "loss": 0.691, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.021781612187623978, "rewards/margins": 0.03288044035434723, "rewards/rejected": -0.011098823510110378, "step": 50 }, { "epoch": 0.06, "learning_rate": 1.0309278350515462e-07, "logits/chosen": 23.825542449951172, "logits/rejected": 23.716323852539062, "logps/chosen": -306.31744384765625, "logps/rejected": -260.7249755859375, "loss": 0.6916, "rewards/accuracies": 0.4375, "rewards/chosen": 0.009675316512584686, "rewards/margins": -0.021775808185338974, "rewards/rejected": 0.03145112842321396, "step": 60 }, { "epoch": 0.07, "learning_rate": 1.202749140893471e-07, "logits/chosen": 23.89028549194336, "logits/rejected": 23.66950798034668, "logps/chosen": -364.57757568359375, "logps/rejected": -250.9732208251953, "loss": 0.6871, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0638527050614357, "rewards/margins": 0.016006827354431152, "rewards/rejected": 0.047845881432294846, "step": 70 }, { "epoch": 0.08, "learning_rate": 1.3745704467353952e-07, "logits/chosen": 23.972980499267578, "logits/rejected": 23.702159881591797, "logps/chosen": -360.4600524902344, "logps/rejected": -277.17767333984375, "loss": 0.6826, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.10171504318714142, "rewards/margins": 0.051059722900390625, "rewards/rejected": 0.05065532401204109, "step": 80 }, { "epoch": 0.09, "learning_rate": 1.5463917525773197e-07, "logits/chosen": 23.601802825927734, "logits/rejected": 23.44902229309082, "logps/chosen": -256.45306396484375, "logps/rejected": -228.2622528076172, "loss": 0.6742, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.13566820323467255, "rewards/margins": 0.05287040024995804, "rewards/rejected": 0.0827978178858757, "step": 90 }, { "epoch": 0.1, "learning_rate": 1.718213058419244e-07, "logits/chosen": 23.945114135742188, "logits/rejected": 23.670852661132812, "logps/chosen": -317.6385192871094, "logps/rejected": -238.4324188232422, "loss": 0.6639, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.14283855259418488, "rewards/margins": 0.07535254955291748, "rewards/rejected": 0.0674859955906868, "step": 100 }, { "epoch": 0.1, "eval_logits/chosen": 23.83555793762207, "eval_logits/rejected": 23.585235595703125, "eval_logps/chosen": -352.80859375, "eval_logps/rejected": -273.12677001953125, "eval_loss": 0.6592543125152588, "eval_rewards/accuracies": 0.6150793433189392, "eval_rewards/chosen": 0.17615097761154175, "eval_rewards/margins": 0.0804828330874443, "eval_rewards/rejected": 0.09566814452409744, "eval_runtime": 210.7096, "eval_samples_per_second": 9.492, "eval_steps_per_second": 0.299, "step": 100 }, { "epoch": 0.11, "learning_rate": 1.8900343642611682e-07, "logits/chosen": 23.709579467773438, "logits/rejected": 23.512853622436523, "logps/chosen": -349.40234375, "logps/rejected": -243.11532592773438, "loss": 0.6541, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.16010603308677673, "rewards/margins": 0.09831614792346954, "rewards/rejected": 0.06178988143801689, "step": 110 }, { "epoch": 0.12, "learning_rate": 2.0618556701030925e-07, "logits/chosen": 23.544376373291016, "logits/rejected": 23.377239227294922, "logps/chosen": -341.64080810546875, "logps/rejected": -247.55844116210938, "loss": 0.6539, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.1492854803800583, "rewards/margins": 0.0526873879134655, "rewards/rejected": 0.09659810364246368, "step": 120 }, { "epoch": 0.13, "learning_rate": 2.2336769759450173e-07, "logits/chosen": 24.006563186645508, "logits/rejected": 23.8785457611084, "logps/chosen": -321.85467529296875, "logps/rejected": -281.0990905761719, "loss": 0.6401, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.19155286252498627, "rewards/margins": 0.1119670420885086, "rewards/rejected": 0.07958582043647766, "step": 130 }, { "epoch": 0.14, "learning_rate": 2.405498281786942e-07, "logits/chosen": 23.71746826171875, "logits/rejected": 23.616607666015625, "logps/chosen": -346.86761474609375, "logps/rejected": -257.8626708984375, "loss": 0.6319, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.179647758603096, "rewards/margins": 0.20804457366466522, "rewards/rejected": -0.02839680388569832, "step": 140 }, { "epoch": 0.15, "learning_rate": 2.5773195876288655e-07, "logits/chosen": 23.601333618164062, "logits/rejected": 23.368152618408203, "logps/chosen": -342.10003662109375, "logps/rejected": -261.25201416015625, "loss": 0.6243, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.13529065251350403, "rewards/margins": 0.20980004966259003, "rewards/rejected": -0.0745093896985054, "step": 150 }, { "epoch": 0.17, "learning_rate": 2.7491408934707903e-07, "logits/chosen": 24.020530700683594, "logits/rejected": 23.818883895874023, "logps/chosen": -362.73968505859375, "logps/rejected": -253.7847137451172, "loss": 0.5915, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.13591055572032928, "rewards/margins": 0.31857621669769287, "rewards/rejected": -0.1826656460762024, "step": 160 }, { "epoch": 0.18, "learning_rate": 2.9209621993127146e-07, "logits/chosen": 23.72347640991211, "logits/rejected": 23.625173568725586, "logps/chosen": -337.2410583496094, "logps/rejected": -265.833740234375, "loss": 0.5966, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.06029454618692398, "rewards/margins": 0.21368882060050964, "rewards/rejected": -0.15339429676532745, "step": 170 }, { "epoch": 0.19, "learning_rate": 3.0927835051546394e-07, "logits/chosen": 24.024005889892578, "logits/rejected": 23.694889068603516, "logps/chosen": -303.23358154296875, "logps/rejected": -259.80047607421875, "loss": 0.5912, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.10565178096294403, "rewards/margins": 0.33361369371414185, "rewards/rejected": -0.2279619425535202, "step": 180 }, { "epoch": 0.2, "learning_rate": 3.2646048109965636e-07, "logits/chosen": 23.458202362060547, "logits/rejected": 23.41326904296875, "logps/chosen": -278.2962341308594, "logps/rejected": -242.08627319335938, "loss": 0.5826, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.05947988107800484, "rewards/margins": 0.3678347170352936, "rewards/rejected": -0.30835479497909546, "step": 190 }, { "epoch": 0.21, "learning_rate": 3.436426116838488e-07, "logits/chosen": 23.741714477539062, "logits/rejected": 23.483057022094727, "logps/chosen": -314.7781066894531, "logps/rejected": -248.27880859375, "loss": 0.5804, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.1307556927204132, "rewards/margins": 0.3709767758846283, "rewards/rejected": -0.2402210682630539, "step": 200 }, { "epoch": 0.21, "eval_logits/chosen": 23.830230712890625, "eval_logits/rejected": 23.587175369262695, "eval_logps/chosen": -353.7904052734375, "eval_logps/rejected": -277.4797668457031, "eval_loss": 0.5836150646209717, "eval_rewards/accuracies": 0.6507936716079712, "eval_rewards/chosen": 0.07797454297542572, "eval_rewards/margins": 0.41760751605033875, "eval_rewards/rejected": -0.33963292837142944, "eval_runtime": 208.5861, "eval_samples_per_second": 9.588, "eval_steps_per_second": 0.302, "step": 200 }, { "epoch": 0.22, "learning_rate": 3.608247422680412e-07, "logits/chosen": 23.76480484008789, "logits/rejected": 23.56380271911621, "logps/chosen": -377.50799560546875, "logps/rejected": -279.08978271484375, "loss": 0.5611, "rewards/accuracies": 0.6875, "rewards/chosen": 0.0905425176024437, "rewards/margins": 0.527503252029419, "rewards/rejected": -0.43696069717407227, "step": 210 }, { "epoch": 0.23, "learning_rate": 3.7800687285223364e-07, "logits/chosen": 23.482959747314453, "logits/rejected": 23.370895385742188, "logps/chosen": -316.96038818359375, "logps/rejected": -253.94686889648438, "loss": 0.5691, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.16429784893989563, "rewards/margins": 0.4349435865879059, "rewards/rejected": -0.5992413759231567, "step": 220 }, { "epoch": 0.24, "learning_rate": 3.9518900343642607e-07, "logits/chosen": 23.473817825317383, "logits/rejected": 23.369760513305664, "logps/chosen": -334.98663330078125, "logps/rejected": -293.44854736328125, "loss": 0.5962, "rewards/accuracies": 0.625, "rewards/chosen": -0.1187012642621994, "rewards/margins": 0.38999611139297485, "rewards/rejected": -0.5086973905563354, "step": 230 }, { "epoch": 0.25, "learning_rate": 4.123711340206185e-07, "logits/chosen": 23.480493545532227, "logits/rejected": 23.42662239074707, "logps/chosen": -329.04595947265625, "logps/rejected": -243.5697784423828, "loss": 0.564, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.16112008690834045, "rewards/margins": 0.41859644651412964, "rewards/rejected": -0.5797165036201477, "step": 240 }, { "epoch": 0.26, "learning_rate": 4.2955326460481097e-07, "logits/chosen": 23.753459930419922, "logits/rejected": 23.624629974365234, "logps/chosen": -347.7720642089844, "logps/rejected": -273.23162841796875, "loss": 0.5833, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.06806284189224243, "rewards/margins": 0.4797073304653168, "rewards/rejected": -0.5477702021598816, "step": 250 }, { "epoch": 0.27, "learning_rate": 4.4673539518900345e-07, "logits/chosen": 23.70407485961914, "logits/rejected": 23.5228328704834, "logps/chosen": -310.2815856933594, "logps/rejected": -250.3536376953125, "loss": 0.5718, "rewards/accuracies": 0.75, "rewards/chosen": -0.07945041358470917, "rewards/margins": 0.49080556631088257, "rewards/rejected": -0.5702559351921082, "step": 260 }, { "epoch": 0.28, "learning_rate": 4.639175257731959e-07, "logits/chosen": 23.76226234436035, "logits/rejected": 23.472620010375977, "logps/chosen": -301.5387268066406, "logps/rejected": -240.7628631591797, "loss": 0.601, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.07512088119983673, "rewards/margins": 0.4631730914115906, "rewards/rejected": -0.5382939577102661, "step": 270 }, { "epoch": 0.29, "learning_rate": 4.810996563573884e-07, "logits/chosen": 23.984760284423828, "logits/rejected": 23.863937377929688, "logps/chosen": -373.2278137207031, "logps/rejected": -285.9132995605469, "loss": 0.5712, "rewards/accuracies": 0.6875, "rewards/chosen": 0.10135757923126221, "rewards/margins": 0.6022639274597168, "rewards/rejected": -0.5009063482284546, "step": 280 }, { "epoch": 0.3, "learning_rate": 4.982817869415807e-07, "logits/chosen": 23.733274459838867, "logits/rejected": 23.508481979370117, "logps/chosen": -356.46099853515625, "logps/rejected": -259.97003173828125, "loss": 0.5751, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.18595895171165466, "rewards/margins": 0.5399104952812195, "rewards/rejected": -0.7258695363998413, "step": 290 }, { "epoch": 0.31, "learning_rate": 4.982778415614236e-07, "logits/chosen": 23.513275146484375, "logits/rejected": 23.471511840820312, "logps/chosen": -293.0979919433594, "logps/rejected": -249.67446899414062, "loss": 0.5815, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.22943711280822754, "rewards/margins": 0.6720036268234253, "rewards/rejected": -0.9014407396316528, "step": 300 }, { "epoch": 0.31, "eval_logits/chosen": 23.749773025512695, "eval_logits/rejected": 23.52240753173828, "eval_logps/chosen": -356.49285888671875, "eval_logps/rejected": -281.9402770996094, "eval_loss": 0.5510157942771912, "eval_rewards/accuracies": 0.7420634627342224, "eval_rewards/chosen": -0.19227494299411774, "eval_rewards/margins": 0.5934095978736877, "eval_rewards/rejected": -0.7856844663619995, "eval_runtime": 210.4467, "eval_samples_per_second": 9.504, "eval_steps_per_second": 0.299, "step": 300 }, { "epoch": 0.32, "learning_rate": 4.963643321852277e-07, "logits/chosen": 23.758647918701172, "logits/rejected": 23.599285125732422, "logps/chosen": -387.0029296875, "logps/rejected": -297.8297119140625, "loss": 0.5858, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.22534582018852234, "rewards/margins": 0.4947783946990967, "rewards/rejected": -0.7201241254806519, "step": 310 }, { "epoch": 0.33, "learning_rate": 4.944508228090318e-07, "logits/chosen": 23.673627853393555, "logits/rejected": 23.470468521118164, "logps/chosen": -269.2679748535156, "logps/rejected": -209.1413116455078, "loss": 0.5428, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.2326889932155609, "rewards/margins": 0.49293145537376404, "rewards/rejected": -0.7256205677986145, "step": 320 }, { "epoch": 0.34, "learning_rate": 4.925373134328357e-07, "logits/chosen": 23.728256225585938, "logits/rejected": 23.57656478881836, "logps/chosen": -341.84552001953125, "logps/rejected": -279.85650634765625, "loss": 0.5848, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.2053179293870926, "rewards/margins": 0.5053264498710632, "rewards/rejected": -0.7106443643569946, "step": 330 }, { "epoch": 0.35, "learning_rate": 4.906238040566398e-07, "logits/chosen": 23.395517349243164, "logits/rejected": 23.30283546447754, "logps/chosen": -276.00958251953125, "logps/rejected": -245.6515655517578, "loss": 0.5731, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.4472281038761139, "rewards/margins": 0.5112749338150024, "rewards/rejected": -0.9585030674934387, "step": 340 }, { "epoch": 0.36, "learning_rate": 4.887102946804438e-07, "logits/chosen": 23.459369659423828, "logits/rejected": 23.258296966552734, "logps/chosen": -351.51153564453125, "logps/rejected": -265.9107666015625, "loss": 0.5436, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.248783141374588, "rewards/margins": 0.7760157585144043, "rewards/rejected": -1.02479887008667, "step": 350 }, { "epoch": 0.37, "learning_rate": 4.867967853042479e-07, "logits/chosen": 23.64513397216797, "logits/rejected": 23.49908447265625, "logps/chosen": -327.1449890136719, "logps/rejected": -301.5306396484375, "loss": 0.5287, "rewards/accuracies": 0.75, "rewards/chosen": -0.29137879610061646, "rewards/margins": 0.8306191563606262, "rewards/rejected": -1.1219979524612427, "step": 360 }, { "epoch": 0.38, "learning_rate": 4.84883275928052e-07, "logits/chosen": 23.72499656677246, "logits/rejected": 23.477428436279297, "logps/chosen": -337.2041320800781, "logps/rejected": -292.93463134765625, "loss": 0.5549, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.20421965420246124, "rewards/margins": 0.776501476764679, "rewards/rejected": -0.980721116065979, "step": 370 }, { "epoch": 0.39, "learning_rate": 4.82969766551856e-07, "logits/chosen": 23.811683654785156, "logits/rejected": 23.42571258544922, "logps/chosen": -364.2945251464844, "logps/rejected": -283.0462646484375, "loss": 0.5698, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.06022878363728523, "rewards/margins": 0.7983857989311218, "rewards/rejected": -0.8586145639419556, "step": 380 }, { "epoch": 0.4, "learning_rate": 4.810562571756601e-07, "logits/chosen": 23.39688491821289, "logits/rejected": 23.162023544311523, "logps/chosen": -323.4096984863281, "logps/rejected": -250.5354461669922, "loss": 0.5771, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.1642606556415558, "rewards/margins": 0.8033970594406128, "rewards/rejected": -0.967657744884491, "step": 390 }, { "epoch": 0.41, "learning_rate": 4.791427477994642e-07, "logits/chosen": 23.463436126708984, "logits/rejected": 23.304988861083984, "logps/chosen": -290.4604797363281, "logps/rejected": -257.20977783203125, "loss": 0.5526, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2738291919231415, "rewards/margins": 0.6287984848022461, "rewards/rejected": -0.9026277661323547, "step": 400 }, { "epoch": 0.41, "eval_logits/chosen": 23.72638702392578, "eval_logits/rejected": 23.50330352783203, "eval_logps/chosen": -356.5235290527344, "eval_logps/rejected": -283.01190185546875, "eval_loss": 0.5360822081565857, "eval_rewards/accuracies": 0.7341269850730896, "eval_rewards/chosen": -0.19533830881118774, "eval_rewards/margins": 0.6975098848342896, "eval_rewards/rejected": -0.8928481936454773, "eval_runtime": 211.6561, "eval_samples_per_second": 9.449, "eval_steps_per_second": 0.298, "step": 400 }, { "epoch": 0.42, "learning_rate": 4.772292384232682e-07, "logits/chosen": 23.651836395263672, "logits/rejected": 23.562541961669922, "logps/chosen": -295.5309143066406, "logps/rejected": -256.42864990234375, "loss": 0.5646, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.3951832056045532, "rewards/margins": 0.48603707551956177, "rewards/rejected": -0.8812202215194702, "step": 410 }, { "epoch": 0.43, "learning_rate": 4.753157290470723e-07, "logits/chosen": 23.555591583251953, "logits/rejected": 23.477405548095703, "logps/chosen": -291.4106140136719, "logps/rejected": -254.1300048828125, "loss": 0.5647, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.44952210783958435, "rewards/margins": 0.4881154000759125, "rewards/rejected": -0.937637448310852, "step": 420 }, { "epoch": 0.44, "learning_rate": 4.7340221967087635e-07, "logits/chosen": 23.740278244018555, "logits/rejected": 23.43073844909668, "logps/chosen": -283.1187438964844, "logps/rejected": -268.111083984375, "loss": 0.5611, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.4848947525024414, "rewards/margins": 0.5371454954147339, "rewards/rejected": -1.0220401287078857, "step": 430 }, { "epoch": 0.45, "learning_rate": 4.714887102946804e-07, "logits/chosen": 23.772052764892578, "logits/rejected": 23.574148178100586, "logps/chosen": -316.62310791015625, "logps/rejected": -249.21389770507812, "loss": 0.5237, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.24736304581165314, "rewards/margins": 0.6857331395149231, "rewards/rejected": -0.9330962300300598, "step": 440 }, { "epoch": 0.46, "learning_rate": 4.6957520091848447e-07, "logits/chosen": 23.818843841552734, "logits/rejected": 23.663349151611328, "logps/chosen": -301.2689208984375, "logps/rejected": -274.0567932128906, "loss": 0.5833, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.04559071734547615, "rewards/margins": 0.5639020800590515, "rewards/rejected": -0.6094927191734314, "step": 450 }, { "epoch": 0.47, "learning_rate": 4.6766169154228853e-07, "logits/chosen": 23.291194915771484, "logits/rejected": 23.422870635986328, "logps/chosen": -323.94488525390625, "logps/rejected": -233.00833129882812, "loss": 0.5194, "rewards/accuracies": 0.75, "rewards/chosen": -0.03107648529112339, "rewards/margins": 0.7044192552566528, "rewards/rejected": -0.7354957461357117, "step": 460 }, { "epoch": 0.49, "learning_rate": 4.657481821660926e-07, "logits/chosen": 23.393449783325195, "logits/rejected": 23.201961517333984, "logps/chosen": -318.3015441894531, "logps/rejected": -219.90170288085938, "loss": 0.5072, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.05152938514947891, "rewards/margins": 0.8564150929450989, "rewards/rejected": -0.90794438123703, "step": 470 }, { "epoch": 0.5, "learning_rate": 4.6383467278989666e-07, "logits/chosen": 23.315677642822266, "logits/rejected": 23.30160903930664, "logps/chosen": -354.28302001953125, "logps/rejected": -268.2124938964844, "loss": 0.5397, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.004499013535678387, "rewards/margins": 0.9241636395454407, "rewards/rejected": -0.9196645617485046, "step": 480 }, { "epoch": 0.51, "learning_rate": 4.6192116341370067e-07, "logits/chosen": 23.65988540649414, "logits/rejected": 23.262027740478516, "logps/chosen": -363.83416748046875, "logps/rejected": -269.9544677734375, "loss": 0.5463, "rewards/accuracies": 0.75, "rewards/chosen": -0.03792769834399223, "rewards/margins": 0.8194522857666016, "rewards/rejected": -0.8573800325393677, "step": 490 }, { "epoch": 0.52, "learning_rate": 4.6000765403750473e-07, "logits/chosen": 23.21782684326172, "logits/rejected": 22.95124053955078, "logps/chosen": -272.1101379394531, "logps/rejected": -233.3650665283203, "loss": 0.5225, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.18960613012313843, "rewards/margins": 0.5189381837844849, "rewards/rejected": -0.7085443139076233, "step": 500 }, { "epoch": 0.52, "eval_logits/chosen": 23.67182731628418, "eval_logits/rejected": 23.457815170288086, "eval_logps/chosen": -355.6113586425781, "eval_logps/rejected": -282.89288330078125, "eval_loss": 0.5261635184288025, "eval_rewards/accuracies": 0.7539682388305664, "eval_rewards/chosen": -0.10412228107452393, "eval_rewards/margins": 0.7768236994743347, "eval_rewards/rejected": -0.8809459805488586, "eval_runtime": 208.2947, "eval_samples_per_second": 9.602, "eval_steps_per_second": 0.302, "step": 500 }, { "epoch": 0.53, "learning_rate": 4.580941446613088e-07, "logits/chosen": 23.604022979736328, "logits/rejected": 23.44409942626953, "logps/chosen": -326.31378173828125, "logps/rejected": -279.2933349609375, "loss": 0.5379, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.2937595248222351, "rewards/margins": 0.6278744339942932, "rewards/rejected": -0.9216337203979492, "step": 510 }, { "epoch": 0.54, "learning_rate": 4.5618063528511285e-07, "logits/chosen": 23.713848114013672, "logits/rejected": 23.53582000732422, "logps/chosen": -304.9338684082031, "logps/rejected": -268.5104064941406, "loss": 0.5433, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.10584266483783722, "rewards/margins": 0.8640462160110474, "rewards/rejected": -0.9698888063430786, "step": 520 }, { "epoch": 0.55, "learning_rate": 4.542671259089169e-07, "logits/chosen": 23.50804328918457, "logits/rejected": 23.286922454833984, "logps/chosen": -291.4549560546875, "logps/rejected": -222.6033935546875, "loss": 0.553, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.3200764060020447, "rewards/margins": 0.6575796008110046, "rewards/rejected": -0.9776560068130493, "step": 530 }, { "epoch": 0.56, "learning_rate": 4.52353616532721e-07, "logits/chosen": 23.716320037841797, "logits/rejected": 23.562469482421875, "logps/chosen": -322.17047119140625, "logps/rejected": -258.58917236328125, "loss": 0.5491, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.1657361090183258, "rewards/margins": 0.7579048871994019, "rewards/rejected": -0.9236409068107605, "step": 540 }, { "epoch": 0.57, "learning_rate": 4.5044010715652504e-07, "logits/chosen": 23.58610725402832, "logits/rejected": 23.374378204345703, "logps/chosen": -303.7857971191406, "logps/rejected": -266.2262878417969, "loss": 0.5447, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.1886710226535797, "rewards/margins": 0.5424461364746094, "rewards/rejected": -0.7311171293258667, "step": 550 }, { "epoch": 0.58, "learning_rate": 4.485265977803291e-07, "logits/chosen": 23.34024429321289, "logits/rejected": 23.08355140686035, "logps/chosen": -346.318603515625, "logps/rejected": -288.74432373046875, "loss": 0.5315, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.1655091792345047, "rewards/margins": 0.8151466250419617, "rewards/rejected": -0.98065584897995, "step": 560 }, { "epoch": 0.59, "learning_rate": 4.4661308840413316e-07, "logits/chosen": 23.876493453979492, "logits/rejected": 23.629627227783203, "logps/chosen": -300.15509033203125, "logps/rejected": -273.7672424316406, "loss": 0.5197, "rewards/accuracies": 0.75, "rewards/chosen": -0.30857834219932556, "rewards/margins": 0.6672872304916382, "rewards/rejected": -0.9758656620979309, "step": 570 }, { "epoch": 0.6, "learning_rate": 4.446995790279372e-07, "logits/chosen": 23.596471786499023, "logits/rejected": 23.427684783935547, "logps/chosen": -334.6555480957031, "logps/rejected": -264.3316345214844, "loss": 0.5231, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.2455914467573166, "rewards/margins": 0.883080005645752, "rewards/rejected": -1.128671407699585, "step": 580 }, { "epoch": 0.61, "learning_rate": 4.4278606965174123e-07, "logits/chosen": 23.314987182617188, "logits/rejected": 23.11943817138672, "logps/chosen": -298.7581787109375, "logps/rejected": -246.3135223388672, "loss": 0.5253, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.3651345372200012, "rewards/margins": 0.677712619304657, "rewards/rejected": -1.0428470373153687, "step": 590 }, { "epoch": 0.62, "learning_rate": 4.408725602755453e-07, "logits/chosen": 23.37234115600586, "logits/rejected": 23.340373992919922, "logps/chosen": -286.86737060546875, "logps/rejected": -220.9873504638672, "loss": 0.5577, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4736716151237488, "rewards/margins": 0.5933648347854614, "rewards/rejected": -1.0670363903045654, "step": 600 }, { "epoch": 0.62, "eval_logits/chosen": 23.661834716796875, "eval_logits/rejected": 23.4466495513916, "eval_logps/chosen": -356.5157775878906, "eval_logps/rejected": -284.3682861328125, "eval_loss": 0.5155950784683228, "eval_rewards/accuracies": 0.7658730149269104, "eval_rewards/chosen": -0.19456443190574646, "eval_rewards/margins": 0.8339203000068665, "eval_rewards/rejected": -1.02848482131958, "eval_runtime": 211.7272, "eval_samples_per_second": 9.446, "eval_steps_per_second": 0.298, "step": 600 }, { "epoch": 0.63, "learning_rate": 4.3895905089934936e-07, "logits/chosen": 23.545116424560547, "logits/rejected": 23.458499908447266, "logps/chosen": -338.8705139160156, "logps/rejected": -272.00714111328125, "loss": 0.5331, "rewards/accuracies": 0.75, "rewards/chosen": -0.2067803144454956, "rewards/margins": 0.8310182690620422, "rewards/rejected": -1.037798523902893, "step": 610 }, { "epoch": 0.64, "learning_rate": 4.370455415231534e-07, "logits/chosen": 23.625316619873047, "logits/rejected": 23.448591232299805, "logps/chosen": -345.89208984375, "logps/rejected": -306.5247497558594, "loss": 0.5182, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.30394795536994934, "rewards/margins": 0.5751500725746155, "rewards/rejected": -0.8790979385375977, "step": 620 }, { "epoch": 0.65, "learning_rate": 4.351320321469575e-07, "logits/chosen": 22.87206268310547, "logits/rejected": 22.757465362548828, "logps/chosen": -309.4687805175781, "logps/rejected": -291.12847900390625, "loss": 0.5417, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.20515112578868866, "rewards/margins": 0.6493626236915588, "rewards/rejected": -0.8545138239860535, "step": 630 }, { "epoch": 0.66, "learning_rate": 4.3321852277076154e-07, "logits/chosen": 23.304141998291016, "logits/rejected": 23.251794815063477, "logps/chosen": -333.1667785644531, "logps/rejected": -272.1311950683594, "loss": 0.5271, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.41801586747169495, "rewards/margins": 0.5841894149780273, "rewards/rejected": -1.0022052526474, "step": 640 }, { "epoch": 0.67, "learning_rate": 4.313050133945656e-07, "logits/chosen": 23.594745635986328, "logits/rejected": 23.500207901000977, "logps/chosen": -357.5419616699219, "logps/rejected": -274.1604309082031, "loss": 0.5192, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.20376773178577423, "rewards/margins": 0.8759373426437378, "rewards/rejected": -1.0797051191329956, "step": 650 }, { "epoch": 0.68, "learning_rate": 4.2939150401836967e-07, "logits/chosen": 23.767133712768555, "logits/rejected": 23.464405059814453, "logps/chosen": -308.720458984375, "logps/rejected": -290.14306640625, "loss": 0.5137, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.28276339173316956, "rewards/margins": 0.6749929189682007, "rewards/rejected": -0.9577562212944031, "step": 660 }, { "epoch": 0.69, "learning_rate": 4.2747799464217373e-07, "logits/chosen": 23.170560836791992, "logits/rejected": 23.10344886779785, "logps/chosen": -350.4610595703125, "logps/rejected": -267.5567932128906, "loss": 0.5273, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.439180463552475, "rewards/margins": 0.6723843216896057, "rewards/rejected": -1.1115647554397583, "step": 670 }, { "epoch": 0.7, "learning_rate": 4.255644852659778e-07, "logits/chosen": 23.414445877075195, "logits/rejected": 23.61502456665039, "logps/chosen": -374.15692138671875, "logps/rejected": -288.3631286621094, "loss": 0.5761, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.08570393174886703, "rewards/margins": 0.6574904918670654, "rewards/rejected": -0.7431942820549011, "step": 680 }, { "epoch": 0.71, "learning_rate": 4.236509758897818e-07, "logits/chosen": 23.552722930908203, "logits/rejected": 23.40909194946289, "logps/chosen": -342.1145324707031, "logps/rejected": -264.7322082519531, "loss": 0.5549, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.16522836685180664, "rewards/margins": 0.6978949904441833, "rewards/rejected": -0.8631232976913452, "step": 690 }, { "epoch": 0.72, "learning_rate": 4.2173746651358586e-07, "logits/chosen": 23.72678565979004, "logits/rejected": 23.425289154052734, "logps/chosen": -331.14410400390625, "logps/rejected": -285.4593200683594, "loss": 0.5515, "rewards/accuracies": 0.75, "rewards/chosen": -0.10031759738922119, "rewards/margins": 0.6387730836868286, "rewards/rejected": -0.739090621471405, "step": 700 }, { "epoch": 0.72, "eval_logits/chosen": 23.634296417236328, "eval_logits/rejected": 23.424331665039062, "eval_logps/chosen": -353.9219665527344, "eval_logps/rejected": -281.7333679199219, "eval_loss": 0.5162664651870728, "eval_rewards/accuracies": 0.7658730149269104, "eval_rewards/chosen": 0.0648159608244896, "eval_rewards/margins": 0.8298115730285645, "eval_rewards/rejected": -0.7649956345558167, "eval_runtime": 211.7482, "eval_samples_per_second": 9.445, "eval_steps_per_second": 0.298, "step": 700 }, { "epoch": 0.73, "learning_rate": 4.198239571373899e-07, "logits/chosen": 23.546445846557617, "logits/rejected": 23.238723754882812, "logps/chosen": -307.0013122558594, "logps/rejected": -247.3063201904297, "loss": 0.5341, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.08633746951818466, "rewards/margins": 0.7773478031158447, "rewards/rejected": -0.8636852502822876, "step": 710 }, { "epoch": 0.74, "learning_rate": 4.17910447761194e-07, "logits/chosen": 23.390674591064453, "logits/rejected": 23.3981876373291, "logps/chosen": -337.4337158203125, "logps/rejected": -311.81414794921875, "loss": 0.5774, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.21913309395313263, "rewards/margins": 0.4323801100254059, "rewards/rejected": -0.6515131592750549, "step": 720 }, { "epoch": 0.75, "learning_rate": 4.1599693838499805e-07, "logits/chosen": 23.517433166503906, "logits/rejected": 23.37581443786621, "logps/chosen": -291.6369323730469, "logps/rejected": -265.741943359375, "loss": 0.5377, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.25872206687927246, "rewards/margins": 0.6349440813064575, "rewards/rejected": -0.8936660885810852, "step": 730 }, { "epoch": 0.76, "learning_rate": 4.140834290088021e-07, "logits/chosen": 23.460777282714844, "logits/rejected": 23.235326766967773, "logps/chosen": -339.9285888671875, "logps/rejected": -266.0412292480469, "loss": 0.5197, "rewards/accuracies": 0.75, "rewards/chosen": -0.11111575365066528, "rewards/margins": 0.9949262738227844, "rewards/rejected": -1.1060421466827393, "step": 740 }, { "epoch": 0.77, "learning_rate": 4.121699196326062e-07, "logits/chosen": 23.193099975585938, "logits/rejected": 23.17205238342285, "logps/chosen": -333.4886474609375, "logps/rejected": -274.37274169921875, "loss": 0.5358, "rewards/accuracies": 0.6875, "rewards/chosen": -0.505832850933075, "rewards/margins": 0.6223492622375488, "rewards/rejected": -1.1281821727752686, "step": 750 }, { "epoch": 0.78, "learning_rate": 4.1025641025641024e-07, "logits/chosen": 23.626953125, "logits/rejected": 23.612979888916016, "logps/chosen": -327.89111328125, "logps/rejected": -297.7337341308594, "loss": 0.5246, "rewards/accuracies": 0.75, "rewards/chosen": -0.35025396943092346, "rewards/margins": 0.8313090205192566, "rewards/rejected": -1.1815630197525024, "step": 760 }, { "epoch": 0.8, "learning_rate": 4.083429008802143e-07, "logits/chosen": 23.58197593688965, "logits/rejected": 23.45255470275879, "logps/chosen": -272.42156982421875, "logps/rejected": -270.8283386230469, "loss": 0.5165, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.3132372796535492, "rewards/margins": 0.5522381663322449, "rewards/rejected": -0.8654754757881165, "step": 770 }, { "epoch": 0.81, "learning_rate": 4.0642939150401836e-07, "logits/chosen": 23.23889923095703, "logits/rejected": 23.24991798400879, "logps/chosen": -314.5959167480469, "logps/rejected": -257.6392517089844, "loss": 0.534, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.4284973740577698, "rewards/margins": 0.6584367156028748, "rewards/rejected": -1.086934208869934, "step": 780 }, { "epoch": 0.82, "learning_rate": 4.0451588212782237e-07, "logits/chosen": 23.414813995361328, "logits/rejected": 23.380718231201172, "logps/chosen": -291.32501220703125, "logps/rejected": -256.01263427734375, "loss": 0.4937, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.18692317605018616, "rewards/margins": 0.8645000457763672, "rewards/rejected": -1.051423192024231, "step": 790 }, { "epoch": 0.83, "learning_rate": 4.0260237275162643e-07, "logits/chosen": 23.40145492553711, "logits/rejected": 23.43955421447754, "logps/chosen": -313.07513427734375, "logps/rejected": -277.38201904296875, "loss": 0.5159, "rewards/accuracies": 0.75, "rewards/chosen": -0.11286661773920059, "rewards/margins": 0.8762611150741577, "rewards/rejected": -0.9891278147697449, "step": 800 }, { "epoch": 0.83, "eval_logits/chosen": 23.617877960205078, "eval_logits/rejected": 23.40951156616211, "eval_logps/chosen": -355.9697570800781, "eval_logps/rejected": -284.6782531738281, "eval_loss": 0.5112624764442444, "eval_rewards/accuracies": 0.7777777910232544, "eval_rewards/chosen": -0.1399604231119156, "eval_rewards/margins": 0.9195234179496765, "eval_rewards/rejected": -1.0594837665557861, "eval_runtime": 211.1679, "eval_samples_per_second": 9.471, "eval_steps_per_second": 0.298, "step": 800 }, { "epoch": 0.84, "learning_rate": 4.006888633754305e-07, "logits/chosen": 23.594928741455078, "logits/rejected": 23.54049301147461, "logps/chosen": -319.88677978515625, "logps/rejected": -260.43389892578125, "loss": 0.4905, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.20980267226696014, "rewards/margins": 0.9086447954177856, "rewards/rejected": -1.1184475421905518, "step": 810 }, { "epoch": 0.85, "learning_rate": 3.9877535399923456e-07, "logits/chosen": 23.482894897460938, "logits/rejected": 23.19647216796875, "logps/chosen": -338.23223876953125, "logps/rejected": -269.0614929199219, "loss": 0.5256, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4344770014286041, "rewards/margins": 0.710912823677063, "rewards/rejected": -1.1453897953033447, "step": 820 }, { "epoch": 0.86, "learning_rate": 3.968618446230386e-07, "logits/chosen": 23.347646713256836, "logits/rejected": 23.12314224243164, "logps/chosen": -311.5711364746094, "logps/rejected": -240.50125122070312, "loss": 0.4916, "rewards/accuracies": 0.75, "rewards/chosen": -0.3820451498031616, "rewards/margins": 1.0347092151641846, "rewards/rejected": -1.4167543649673462, "step": 830 }, { "epoch": 0.87, "learning_rate": 3.949483352468427e-07, "logits/chosen": 23.311033248901367, "logits/rejected": 23.248620986938477, "logps/chosen": -281.490966796875, "logps/rejected": -240.92086791992188, "loss": 0.556, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.47001034021377563, "rewards/margins": 0.762096643447876, "rewards/rejected": -1.2321069240570068, "step": 840 }, { "epoch": 0.88, "learning_rate": 3.9303482587064674e-07, "logits/chosen": 23.50173568725586, "logits/rejected": 23.377094268798828, "logps/chosen": -290.4707336425781, "logps/rejected": -248.4992218017578, "loss": 0.5133, "rewards/accuracies": 0.75, "rewards/chosen": -0.25379228591918945, "rewards/margins": 0.8668729662895203, "rewards/rejected": -1.1206653118133545, "step": 850 }, { "epoch": 0.89, "learning_rate": 3.911213164944508e-07, "logits/chosen": 23.733707427978516, "logits/rejected": 23.433372497558594, "logps/chosen": -346.18353271484375, "logps/rejected": -291.7870788574219, "loss": 0.5163, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.3051421046257019, "rewards/margins": 0.9217368364334106, "rewards/rejected": -1.2268788814544678, "step": 860 }, { "epoch": 0.9, "learning_rate": 3.8920780711825487e-07, "logits/chosen": 23.664628982543945, "logits/rejected": 23.414520263671875, "logps/chosen": -396.57269287109375, "logps/rejected": -270.23681640625, "loss": 0.5127, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.19412247836589813, "rewards/margins": 0.9694843292236328, "rewards/rejected": -1.163606882095337, "step": 870 }, { "epoch": 0.91, "learning_rate": 3.8729429774205893e-07, "logits/chosen": 23.33928680419922, "logits/rejected": 23.3987979888916, "logps/chosen": -381.9424133300781, "logps/rejected": -267.4436340332031, "loss": 0.5481, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1822233498096466, "rewards/margins": 1.0088589191436768, "rewards/rejected": -1.191082239151001, "step": 880 }, { "epoch": 0.92, "learning_rate": 3.8538078836586294e-07, "logits/chosen": 23.547710418701172, "logits/rejected": 23.508426666259766, "logps/chosen": -332.8504638671875, "logps/rejected": -287.86712646484375, "loss": 0.5454, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.26958388090133667, "rewards/margins": 0.755517840385437, "rewards/rejected": -1.025101661682129, "step": 890 }, { "epoch": 0.93, "learning_rate": 3.83467278989667e-07, "logits/chosen": 23.69491195678711, "logits/rejected": 23.59137725830078, "logps/chosen": -287.8290100097656, "logps/rejected": -236.21438598632812, "loss": 0.5242, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07682293653488159, "rewards/margins": 0.8297051191329956, "rewards/rejected": -0.906528115272522, "step": 900 }, { "epoch": 0.93, "eval_logits/chosen": 23.614517211914062, "eval_logits/rejected": 23.403518676757812, "eval_logps/chosen": -354.952880859375, "eval_logps/rejected": -283.23175048828125, "eval_loss": 0.5089067220687866, "eval_rewards/accuracies": 0.7658730149269104, "eval_rewards/chosen": -0.03827480971813202, "eval_rewards/margins": 0.8765569925308228, "eval_rewards/rejected": -0.914831817150116, "eval_runtime": 210.9611, "eval_samples_per_second": 9.48, "eval_steps_per_second": 0.299, "step": 900 }, { "epoch": 0.94, "learning_rate": 3.8155376961347106e-07, "logits/chosen": 23.2309627532959, "logits/rejected": 23.20724105834961, "logps/chosen": -246.0962677001953, "logps/rejected": -230.4759979248047, "loss": 0.5286, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.24791307747364044, "rewards/margins": 0.7959302663803101, "rewards/rejected": -1.0438432693481445, "step": 910 }, { "epoch": 0.95, "learning_rate": 3.796402602372751e-07, "logits/chosen": 23.323627471923828, "logits/rejected": 23.179901123046875, "logps/chosen": -294.0438537597656, "logps/rejected": -236.05673217773438, "loss": 0.5097, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.21511948108673096, "rewards/margins": 0.6154388189315796, "rewards/rejected": -0.8305583000183105, "step": 920 }, { "epoch": 0.96, "learning_rate": 3.777267508610792e-07, "logits/chosen": 23.346338272094727, "logits/rejected": 23.224199295043945, "logps/chosen": -318.80096435546875, "logps/rejected": -251.92593383789062, "loss": 0.5228, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.1629999428987503, "rewards/margins": 0.8454955816268921, "rewards/rejected": -1.008495569229126, "step": 930 }, { "epoch": 0.97, "learning_rate": 3.7581324148488325e-07, "logits/chosen": 23.233306884765625, "logits/rejected": 23.179094314575195, "logps/chosen": -330.75469970703125, "logps/rejected": -246.1700897216797, "loss": 0.5374, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.317868173122406, "rewards/margins": 0.8588002920150757, "rewards/rejected": -1.176668405532837, "step": 940 }, { "epoch": 0.98, "learning_rate": 3.738997321086873e-07, "logits/chosen": 23.4466495513916, "logits/rejected": 23.421428680419922, "logps/chosen": -325.29388427734375, "logps/rejected": -277.3059387207031, "loss": 0.5251, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.012738706544041634, "rewards/margins": 0.8846112489700317, "rewards/rejected": -0.8973498344421387, "step": 950 }, { "epoch": 0.99, "learning_rate": 3.7198622273249137e-07, "logits/chosen": 23.535139083862305, "logits/rejected": 23.37562370300293, "logps/chosen": -335.9436340332031, "logps/rejected": -274.2239990234375, "loss": 0.5143, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.1785895973443985, "rewards/margins": 0.5961320400238037, "rewards/rejected": -0.7747215628623962, "step": 960 }, { "epoch": 1.0, "learning_rate": 3.7007271335629544e-07, "logits/chosen": 23.633747100830078, "logits/rejected": 23.46231460571289, "logps/chosen": -295.21759033203125, "logps/rejected": -258.9006042480469, "loss": 0.4828, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.19572855532169342, "rewards/margins": 0.8321182131767273, "rewards/rejected": -1.0278469324111938, "step": 970 }, { "epoch": 1.01, "learning_rate": 3.681592039800995e-07, "logits/chosen": 23.68822479248047, "logits/rejected": 23.501148223876953, "logps/chosen": -299.13140869140625, "logps/rejected": -279.1519470214844, "loss": 0.4402, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.023116961121559143, "rewards/margins": 1.1388248205184937, "rewards/rejected": -1.1157079935073853, "step": 980 }, { "epoch": 1.02, "learning_rate": 3.662456946039035e-07, "logits/chosen": 23.261262893676758, "logits/rejected": 22.952524185180664, "logps/chosen": -308.18536376953125, "logps/rejected": -273.8042907714844, "loss": 0.4855, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.0076486109755933285, "rewards/margins": 1.2322968244552612, "rewards/rejected": -1.2399452924728394, "step": 990 }, { "epoch": 1.03, "learning_rate": 3.6433218522770757e-07, "logits/chosen": 23.609331130981445, "logits/rejected": 23.460206985473633, "logps/chosen": -298.27410888671875, "logps/rejected": -291.9101867675781, "loss": 0.4618, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.07892550528049469, "rewards/margins": 0.973551869392395, "rewards/rejected": -1.0524773597717285, "step": 1000 }, { "epoch": 1.03, "eval_logits/chosen": 23.585590362548828, "eval_logits/rejected": 23.38045883178711, "eval_logps/chosen": -355.79290771484375, "eval_logps/rejected": -284.2840881347656, "eval_loss": 0.5076952576637268, "eval_rewards/accuracies": 0.7777777910232544, "eval_rewards/chosen": -0.12227805703878403, "eval_rewards/margins": 0.8977885842323303, "eval_rewards/rejected": -1.020066499710083, "eval_runtime": 209.3271, "eval_samples_per_second": 9.554, "eval_steps_per_second": 0.301, "step": 1000 }, { "epoch": 1.04, "learning_rate": 3.6241867585151163e-07, "logits/chosen": 23.0650577545166, "logits/rejected": 22.977046966552734, "logps/chosen": -336.06597900390625, "logps/rejected": -278.80828857421875, "loss": 0.4487, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.024078911170363426, "rewards/margins": 1.0799994468688965, "rewards/rejected": -1.1040784120559692, "step": 1010 }, { "epoch": 1.05, "learning_rate": 3.605051664753157e-07, "logits/chosen": 23.251041412353516, "logits/rejected": 23.117984771728516, "logps/chosen": -303.11199951171875, "logps/rejected": -243.7981719970703, "loss": 0.415, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.18380531668663025, "rewards/margins": 1.1280765533447266, "rewards/rejected": -1.3118817806243896, "step": 1020 }, { "epoch": 1.06, "learning_rate": 3.5859165709911975e-07, "logits/chosen": 23.297225952148438, "logits/rejected": 23.318119049072266, "logps/chosen": -334.6638488769531, "logps/rejected": -316.2551574707031, "loss": 0.4302, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.2101324051618576, "rewards/margins": 0.9589886665344238, "rewards/rejected": -1.1691210269927979, "step": 1030 }, { "epoch": 1.07, "learning_rate": 3.566781477229238e-07, "logits/chosen": 23.525909423828125, "logits/rejected": 23.159460067749023, "logps/chosen": -318.52093505859375, "logps/rejected": -268.981201171875, "loss": 0.4484, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.31944024562835693, "rewards/margins": 0.7762435674667358, "rewards/rejected": -1.0956838130950928, "step": 1040 }, { "epoch": 1.08, "learning_rate": 3.547646383467279e-07, "logits/chosen": 23.090410232543945, "logits/rejected": 23.27143669128418, "logps/chosen": -314.944580078125, "logps/rejected": -246.20974731445312, "loss": 0.4362, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.34100785851478577, "rewards/margins": 0.8710842132568359, "rewards/rejected": -1.2120921611785889, "step": 1050 }, { "epoch": 1.09, "learning_rate": 3.5285112897053194e-07, "logits/chosen": 23.36246681213379, "logits/rejected": 23.223459243774414, "logps/chosen": -286.5093078613281, "logps/rejected": -283.33514404296875, "loss": 0.4188, "rewards/accuracies": 0.875, "rewards/chosen": 0.0016206980217248201, "rewards/margins": 1.3344464302062988, "rewards/rejected": -1.3328258991241455, "step": 1060 }, { "epoch": 1.1, "learning_rate": 3.50937619594336e-07, "logits/chosen": 23.439857482910156, "logits/rejected": 23.41635513305664, "logps/chosen": -319.5511169433594, "logps/rejected": -310.8269348144531, "loss": 0.4497, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.29633527994155884, "rewards/margins": 0.8939793705940247, "rewards/rejected": -1.190314531326294, "step": 1070 }, { "epoch": 1.12, "learning_rate": 3.4902411021814007e-07, "logits/chosen": 23.304676055908203, "logits/rejected": 23.236148834228516, "logps/chosen": -329.2352600097656, "logps/rejected": -261.53082275390625, "loss": 0.4335, "rewards/accuracies": 0.8125, "rewards/chosen": -0.15099604427814484, "rewards/margins": 1.2369451522827148, "rewards/rejected": -1.3879411220550537, "step": 1080 }, { "epoch": 1.13, "learning_rate": 3.4711060084194413e-07, "logits/chosen": 23.37632179260254, "logits/rejected": 23.350784301757812, "logps/chosen": -360.5380859375, "logps/rejected": -272.3520812988281, "loss": 0.442, "rewards/accuracies": 0.8125, "rewards/chosen": 0.015167620964348316, "rewards/margins": 1.156204342842102, "rewards/rejected": -1.1410366296768188, "step": 1090 }, { "epoch": 1.14, "learning_rate": 3.4519709146574814e-07, "logits/chosen": 23.032573699951172, "logits/rejected": 22.9952449798584, "logps/chosen": -225.4923858642578, "logps/rejected": -196.6672821044922, "loss": 0.4484, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.4950261116027832, "rewards/margins": 0.9079095721244812, "rewards/rejected": -1.4029356241226196, "step": 1100 }, { "epoch": 1.14, "eval_logits/chosen": 23.538101196289062, "eval_logits/rejected": 23.34269142150879, "eval_logps/chosen": -357.8807373046875, "eval_logps/rejected": -287.3826599121094, "eval_loss": 0.5019155144691467, "eval_rewards/accuracies": 0.7777777910232544, "eval_rewards/chosen": -0.3310595154762268, "eval_rewards/margins": 0.9988633990287781, "eval_rewards/rejected": -1.3299229145050049, "eval_runtime": 210.9987, "eval_samples_per_second": 9.479, "eval_steps_per_second": 0.299, "step": 1100 }, { "epoch": 1.15, "learning_rate": 3.432835820895522e-07, "logits/chosen": 23.460542678833008, "logits/rejected": 23.26938247680664, "logps/chosen": -358.67877197265625, "logps/rejected": -289.0791931152344, "loss": 0.4235, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.26543277502059937, "rewards/margins": 1.2576963901519775, "rewards/rejected": -1.5231291055679321, "step": 1110 }, { "epoch": 1.16, "learning_rate": 3.4137007271335626e-07, "logits/chosen": 23.63456916809082, "logits/rejected": 23.502344131469727, "logps/chosen": -284.9480895996094, "logps/rejected": -279.4847412109375, "loss": 0.4245, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.3471234440803528, "rewards/margins": 1.1683541536331177, "rewards/rejected": -1.5154775381088257, "step": 1120 }, { "epoch": 1.17, "learning_rate": 3.394565633371603e-07, "logits/chosen": 23.349411010742188, "logits/rejected": 23.35630226135254, "logps/chosen": -341.84881591796875, "logps/rejected": -319.86358642578125, "loss": 0.4209, "rewards/accuracies": 0.8125, "rewards/chosen": -0.29219168424606323, "rewards/margins": 1.1724560260772705, "rewards/rejected": -1.464647889137268, "step": 1130 }, { "epoch": 1.18, "learning_rate": 3.375430539609644e-07, "logits/chosen": 23.028972625732422, "logits/rejected": 23.105947494506836, "logps/chosen": -298.8593444824219, "logps/rejected": -308.6123046875, "loss": 0.4111, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.41705456376075745, "rewards/margins": 0.954562783241272, "rewards/rejected": -1.3716174364089966, "step": 1140 }, { "epoch": 1.19, "learning_rate": 3.3562954458476845e-07, "logits/chosen": 23.36569595336914, "logits/rejected": 23.174901962280273, "logps/chosen": -414.62567138671875, "logps/rejected": -282.6720275878906, "loss": 0.4634, "rewards/accuracies": 0.8125, "rewards/chosen": -0.30735766887664795, "rewards/margins": 1.2201875448226929, "rewards/rejected": -1.5275452136993408, "step": 1150 }, { "epoch": 1.2, "learning_rate": 3.337160352085725e-07, "logits/chosen": 23.13878059387207, "logits/rejected": 23.121612548828125, "logps/chosen": -331.0238342285156, "logps/rejected": -275.01129150390625, "loss": 0.4363, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.260793000459671, "rewards/margins": 1.1585649251937866, "rewards/rejected": -1.4193580150604248, "step": 1160 }, { "epoch": 1.21, "learning_rate": 3.3180252583237657e-07, "logits/chosen": 23.040502548217773, "logits/rejected": 22.87631607055664, "logps/chosen": -301.6041564941406, "logps/rejected": -246.01254272460938, "loss": 0.4526, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.19479836523532867, "rewards/margins": 1.1458656787872314, "rewards/rejected": -1.340664029121399, "step": 1170 }, { "epoch": 1.22, "learning_rate": 3.2988901645618063e-07, "logits/chosen": 23.21322250366211, "logits/rejected": 22.910724639892578, "logps/chosen": -270.7756652832031, "logps/rejected": -233.3585968017578, "loss": 0.4396, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.4070712924003601, "rewards/margins": 0.8780719041824341, "rewards/rejected": -1.2851431369781494, "step": 1180 }, { "epoch": 1.23, "learning_rate": 3.279755070799847e-07, "logits/chosen": 23.265064239501953, "logits/rejected": 23.274433135986328, "logps/chosen": -315.3988037109375, "logps/rejected": -291.09375, "loss": 0.4049, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.24129147827625275, "rewards/margins": 1.1495540142059326, "rewards/rejected": -1.390845537185669, "step": 1190 }, { "epoch": 1.24, "learning_rate": 3.260619977037887e-07, "logits/chosen": 23.321971893310547, "logits/rejected": 23.265857696533203, "logps/chosen": -304.5440368652344, "logps/rejected": -283.64764404296875, "loss": 0.4228, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.15949824452400208, "rewards/margins": 1.1103687286376953, "rewards/rejected": -1.269866704940796, "step": 1200 }, { "epoch": 1.24, "eval_logits/chosen": 23.51008415222168, "eval_logits/rejected": 23.319059371948242, "eval_logps/chosen": -355.1871337890625, "eval_logps/rejected": -285.07257080078125, "eval_loss": 0.5033829212188721, "eval_rewards/accuracies": 0.761904776096344, "eval_rewards/chosen": -0.06169680133461952, "eval_rewards/margins": 1.0372183322906494, "eval_rewards/rejected": -1.0989152193069458, "eval_runtime": 207.9261, "eval_samples_per_second": 9.619, "eval_steps_per_second": 0.303, "step": 1200 }, { "epoch": 1.25, "learning_rate": 3.2414848832759277e-07, "logits/chosen": 23.359235763549805, "logits/rejected": 23.241931915283203, "logps/chosen": -253.0906219482422, "logps/rejected": -248.0492706298828, "loss": 0.4028, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.09821876138448715, "rewards/margins": 1.3042573928833008, "rewards/rejected": -1.4024760723114014, "step": 1210 }, { "epoch": 1.26, "learning_rate": 3.2223497895139683e-07, "logits/chosen": 23.251922607421875, "logits/rejected": 23.224475860595703, "logps/chosen": -323.7980041503906, "logps/rejected": -312.51934814453125, "loss": 0.4376, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.02340073511004448, "rewards/margins": 1.2130701541900635, "rewards/rejected": -1.2364708185195923, "step": 1220 }, { "epoch": 1.27, "learning_rate": 3.203214695752009e-07, "logits/chosen": 23.745332717895508, "logits/rejected": 23.615753173828125, "logps/chosen": -313.46453857421875, "logps/rejected": -308.4986877441406, "loss": 0.4391, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.15489891171455383, "rewards/margins": 0.8978082537651062, "rewards/rejected": -1.0527071952819824, "step": 1230 }, { "epoch": 1.28, "learning_rate": 3.1840796019900495e-07, "logits/chosen": 23.76753044128418, "logits/rejected": 23.41860580444336, "logps/chosen": -348.83258056640625, "logps/rejected": -300.46893310546875, "loss": 0.4562, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.1853850781917572, "rewards/margins": 1.198961853981018, "rewards/rejected": -1.3843467235565186, "step": 1240 }, { "epoch": 1.29, "learning_rate": 3.16494450822809e-07, "logits/chosen": 23.356491088867188, "logits/rejected": 23.26506996154785, "logps/chosen": -251.2193145751953, "logps/rejected": -268.14215087890625, "loss": 0.4412, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5262545347213745, "rewards/margins": 0.6762484312057495, "rewards/rejected": -1.202502965927124, "step": 1250 }, { "epoch": 1.3, "learning_rate": 3.145809414466131e-07, "logits/chosen": 23.517498016357422, "logits/rejected": 23.316844940185547, "logps/chosen": -342.25604248046875, "logps/rejected": -239.2180633544922, "loss": 0.4239, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.42340001463890076, "rewards/margins": 1.1046512126922607, "rewards/rejected": -1.5280513763427734, "step": 1260 }, { "epoch": 1.31, "learning_rate": 3.1266743207041714e-07, "logits/chosen": 23.371051788330078, "logits/rejected": 23.382333755493164, "logps/chosen": -388.79425048828125, "logps/rejected": -311.9518127441406, "loss": 0.4435, "rewards/accuracies": 0.75, "rewards/chosen": -0.25299787521362305, "rewards/margins": 1.0895098447799683, "rewards/rejected": -1.3425077199935913, "step": 1270 }, { "epoch": 1.32, "learning_rate": 3.107539226942212e-07, "logits/chosen": 23.398571014404297, "logits/rejected": 23.297183990478516, "logps/chosen": -284.0433349609375, "logps/rejected": -269.79901123046875, "loss": 0.4612, "rewards/accuracies": 0.8125, "rewards/chosen": -0.34444212913513184, "rewards/margins": 1.0336390733718872, "rewards/rejected": -1.3780810832977295, "step": 1280 }, { "epoch": 1.33, "learning_rate": 3.0884041331802526e-07, "logits/chosen": 23.48178482055664, "logits/rejected": 23.24820899963379, "logps/chosen": -316.18109130859375, "logps/rejected": -268.31182861328125, "loss": 0.4396, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.26420363783836365, "rewards/margins": 1.2460224628448486, "rewards/rejected": -1.5102260112762451, "step": 1290 }, { "epoch": 1.34, "learning_rate": 3.0692690394182927e-07, "logits/chosen": 23.08510971069336, "logits/rejected": 22.929636001586914, "logps/chosen": -303.6866760253906, "logps/rejected": -280.0844421386719, "loss": 0.4306, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.46213406324386597, "rewards/margins": 1.072772741317749, "rewards/rejected": -1.5349067449569702, "step": 1300 }, { "epoch": 1.34, "eval_logits/chosen": 23.478702545166016, "eval_logits/rejected": 23.2889404296875, "eval_logps/chosen": -356.1548767089844, "eval_logps/rejected": -285.9320373535156, "eval_loss": 0.5032446384429932, "eval_rewards/accuracies": 0.7698412537574768, "eval_rewards/chosen": -0.15847428143024445, "eval_rewards/margins": 1.0263888835906982, "eval_rewards/rejected": -1.1848632097244263, "eval_runtime": 214.9168, "eval_samples_per_second": 9.306, "eval_steps_per_second": 0.293, "step": 1300 }, { "epoch": 1.35, "learning_rate": 3.0501339456563334e-07, "logits/chosen": 23.21750831604004, "logits/rejected": 23.04998016357422, "logps/chosen": -319.6648254394531, "logps/rejected": -272.9951477050781, "loss": 0.42, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.12266921997070312, "rewards/margins": 1.1443729400634766, "rewards/rejected": -1.2670420408248901, "step": 1310 }, { "epoch": 1.36, "learning_rate": 3.030998851894374e-07, "logits/chosen": 22.977802276611328, "logits/rejected": 22.969890594482422, "logps/chosen": -301.5118408203125, "logps/rejected": -249.35372924804688, "loss": 0.4142, "rewards/accuracies": 0.75, "rewards/chosen": -0.41769084334373474, "rewards/margins": 0.9436267614364624, "rewards/rejected": -1.3613176345825195, "step": 1320 }, { "epoch": 1.37, "learning_rate": 3.0118637581324146e-07, "logits/chosen": 23.275325775146484, "logits/rejected": 23.174297332763672, "logps/chosen": -339.08258056640625, "logps/rejected": -282.3678283691406, "loss": 0.422, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.15730026364326477, "rewards/margins": 1.1286394596099854, "rewards/rejected": -1.2859396934509277, "step": 1330 }, { "epoch": 1.38, "learning_rate": 2.992728664370455e-07, "logits/chosen": 23.317642211914062, "logits/rejected": 23.297130584716797, "logps/chosen": -263.40313720703125, "logps/rejected": -259.40655517578125, "loss": 0.4378, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.5026736259460449, "rewards/margins": 0.7866870760917664, "rewards/rejected": -1.289360761642456, "step": 1340 }, { "epoch": 1.39, "learning_rate": 2.973593570608496e-07, "logits/chosen": 23.30654525756836, "logits/rejected": 23.131305694580078, "logps/chosen": -299.3922424316406, "logps/rejected": -269.0783386230469, "loss": 0.4366, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.43080934882164, "rewards/margins": 1.0835515260696411, "rewards/rejected": -1.514360785484314, "step": 1350 }, { "epoch": 1.4, "learning_rate": 2.9544584768465365e-07, "logits/chosen": 23.53885269165039, "logits/rejected": 23.299760818481445, "logps/chosen": -324.5937194824219, "logps/rejected": -272.784912109375, "loss": 0.4427, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.40283799171447754, "rewards/margins": 1.1869771480560303, "rewards/rejected": -1.5898151397705078, "step": 1360 }, { "epoch": 1.41, "learning_rate": 2.935323383084577e-07, "logits/chosen": 23.592838287353516, "logits/rejected": 23.14777183532715, "logps/chosen": -392.72662353515625, "logps/rejected": -286.9781494140625, "loss": 0.4427, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.39330539107322693, "rewards/margins": 1.0419480800628662, "rewards/rejected": -1.4352535009384155, "step": 1370 }, { "epoch": 1.42, "learning_rate": 2.9161882893226177e-07, "logits/chosen": 23.269372940063477, "logits/rejected": 22.95911979675293, "logps/chosen": -316.82574462890625, "logps/rejected": -261.1885070800781, "loss": 0.433, "rewards/accuracies": 0.8125, "rewards/chosen": -0.317082941532135, "rewards/margins": 1.2134672403335571, "rewards/rejected": -1.530550241470337, "step": 1380 }, { "epoch": 1.44, "learning_rate": 2.8970531955606583e-07, "logits/chosen": 23.39419937133789, "logits/rejected": 23.142974853515625, "logps/chosen": -336.1145935058594, "logps/rejected": -242.3621826171875, "loss": 0.4514, "rewards/accuracies": 0.875, "rewards/chosen": -0.19971036911010742, "rewards/margins": 1.381958246231079, "rewards/rejected": -1.5816686153411865, "step": 1390 }, { "epoch": 1.45, "learning_rate": 2.8779181017986984e-07, "logits/chosen": 23.005327224731445, "logits/rejected": 22.998939514160156, "logps/chosen": -389.1512756347656, "logps/rejected": -290.7593688964844, "loss": 0.4678, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.16274584829807281, "rewards/margins": 1.0353623628616333, "rewards/rejected": -1.198108196258545, "step": 1400 }, { "epoch": 1.45, "eval_logits/chosen": 23.455062866210938, "eval_logits/rejected": 23.266075134277344, "eval_logps/chosen": -356.9206848144531, "eval_logps/rejected": -285.68414306640625, "eval_loss": 0.5029928684234619, "eval_rewards/accuracies": 0.7817460298538208, "eval_rewards/chosen": -0.23505355417728424, "eval_rewards/margins": 0.9250208735466003, "eval_rewards/rejected": -1.1600743532180786, "eval_runtime": 212.5498, "eval_samples_per_second": 9.41, "eval_steps_per_second": 0.296, "step": 1400 }, { "epoch": 1.46, "learning_rate": 2.858783008036739e-07, "logits/chosen": 23.323863983154297, "logits/rejected": 23.302270889282227, "logps/chosen": -362.5963134765625, "logps/rejected": -284.63519287109375, "loss": 0.4375, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.15233711898326874, "rewards/margins": 1.4009491205215454, "rewards/rejected": -1.5532863140106201, "step": 1410 }, { "epoch": 1.47, "learning_rate": 2.8396479142747797e-07, "logits/chosen": 23.571504592895508, "logits/rejected": 23.417720794677734, "logps/chosen": -313.31170654296875, "logps/rejected": -293.6142272949219, "loss": 0.4351, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.16742083430290222, "rewards/margins": 1.0415483713150024, "rewards/rejected": -1.208969235420227, "step": 1420 }, { "epoch": 1.48, "learning_rate": 2.8205128205128203e-07, "logits/chosen": 23.33370590209961, "logits/rejected": 23.32365608215332, "logps/chosen": -299.41351318359375, "logps/rejected": -302.18939208984375, "loss": 0.4146, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.1101953536272049, "rewards/margins": 1.1390321254730225, "rewards/rejected": -1.2492274045944214, "step": 1430 }, { "epoch": 1.49, "learning_rate": 2.801377726750861e-07, "logits/chosen": 23.091251373291016, "logits/rejected": 23.06249237060547, "logps/chosen": -288.70050048828125, "logps/rejected": -261.63494873046875, "loss": 0.4288, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.41927942633628845, "rewards/margins": 0.8092526197433472, "rewards/rejected": -1.228532075881958, "step": 1440 }, { "epoch": 1.5, "learning_rate": 2.7822426329889015e-07, "logits/chosen": 22.919397354125977, "logits/rejected": 22.959392547607422, "logps/chosen": -316.126953125, "logps/rejected": -247.66845703125, "loss": 0.4636, "rewards/accuracies": 0.875, "rewards/chosen": -0.22512026131153107, "rewards/margins": 1.1928333044052124, "rewards/rejected": -1.4179537296295166, "step": 1450 }, { "epoch": 1.51, "learning_rate": 2.763107539226942e-07, "logits/chosen": 23.120534896850586, "logits/rejected": 22.996898651123047, "logps/chosen": -366.4676513671875, "logps/rejected": -257.8288879394531, "loss": 0.4564, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.3532385230064392, "rewards/margins": 1.007889986038208, "rewards/rejected": -1.361128568649292, "step": 1460 }, { "epoch": 1.52, "learning_rate": 2.743972445464983e-07, "logits/chosen": 23.175884246826172, "logits/rejected": 23.074565887451172, "logps/chosen": -285.8373718261719, "logps/rejected": -230.23263549804688, "loss": 0.4406, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.30376359820365906, "rewards/margins": 0.9630700945854187, "rewards/rejected": -1.2668339014053345, "step": 1470 }, { "epoch": 1.53, "learning_rate": 2.7248373517030234e-07, "logits/chosen": 23.030946731567383, "logits/rejected": 23.114126205444336, "logps/chosen": -342.0677185058594, "logps/rejected": -273.91558837890625, "loss": 0.4265, "rewards/accuracies": 0.75, "rewards/chosen": -0.3027496933937073, "rewards/margins": 0.7853070497512817, "rewards/rejected": -1.0880568027496338, "step": 1480 }, { "epoch": 1.54, "learning_rate": 2.705702257941064e-07, "logits/chosen": 23.109298706054688, "logits/rejected": 22.95934295654297, "logps/chosen": -353.9482727050781, "logps/rejected": -293.26165771484375, "loss": 0.3973, "rewards/accuracies": 0.8125, "rewards/chosen": -0.19786301255226135, "rewards/margins": 1.2528201341629028, "rewards/rejected": -1.4506832361221313, "step": 1490 }, { "epoch": 1.55, "learning_rate": 2.686567164179104e-07, "logits/chosen": 23.07802391052246, "logits/rejected": 22.99662208557129, "logps/chosen": -328.4552001953125, "logps/rejected": -250.57901000976562, "loss": 0.4317, "rewards/accuracies": 0.875, "rewards/chosen": -0.1944916695356369, "rewards/margins": 1.2932884693145752, "rewards/rejected": -1.4877803325653076, "step": 1500 }, { "epoch": 1.55, "eval_logits/chosen": 23.452411651611328, "eval_logits/rejected": 23.262121200561523, "eval_logps/chosen": -355.9715576171875, "eval_logps/rejected": -285.541748046875, "eval_loss": 0.49968841671943665, "eval_rewards/accuracies": 0.761904776096344, "eval_rewards/chosen": -0.1401444375514984, "eval_rewards/margins": 1.0056895017623901, "eval_rewards/rejected": -1.1458338499069214, "eval_runtime": 210.203, "eval_samples_per_second": 9.515, "eval_steps_per_second": 0.3, "step": 1500 }, { "epoch": 1.56, "learning_rate": 2.6674320704171447e-07, "logits/chosen": 23.38498306274414, "logits/rejected": 23.101451873779297, "logps/chosen": -313.84991455078125, "logps/rejected": -229.9058837890625, "loss": 0.4147, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.2590484023094177, "rewards/margins": 0.8956319689750671, "rewards/rejected": -1.1546803712844849, "step": 1510 }, { "epoch": 1.57, "learning_rate": 2.6482969766551853e-07, "logits/chosen": 23.376522064208984, "logits/rejected": 23.071407318115234, "logps/chosen": -294.4135437011719, "logps/rejected": -286.2037658691406, "loss": 0.4243, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.2608863413333893, "rewards/margins": 1.1486496925354004, "rewards/rejected": -1.4095360040664673, "step": 1520 }, { "epoch": 1.58, "learning_rate": 2.629161882893226e-07, "logits/chosen": 23.766990661621094, "logits/rejected": 23.534847259521484, "logps/chosen": -363.1257629394531, "logps/rejected": -257.43377685546875, "loss": 0.4044, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.28155818581581116, "rewards/margins": 1.1225359439849854, "rewards/rejected": -1.4040942192077637, "step": 1530 }, { "epoch": 1.59, "learning_rate": 2.6100267891312666e-07, "logits/chosen": 23.727947235107422, "logits/rejected": 23.551546096801758, "logps/chosen": -354.59808349609375, "logps/rejected": -309.74041748046875, "loss": 0.4358, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.2375718653202057, "rewards/margins": 0.9404077529907227, "rewards/rejected": -1.177979588508606, "step": 1540 }, { "epoch": 1.6, "learning_rate": 2.590891695369307e-07, "logits/chosen": 23.435352325439453, "logits/rejected": 23.340909957885742, "logps/chosen": -323.2891845703125, "logps/rejected": -256.3253479003906, "loss": 0.4108, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.21504418551921844, "rewards/margins": 1.1017714738845825, "rewards/rejected": -1.316815733909607, "step": 1550 }, { "epoch": 1.61, "learning_rate": 2.571756601607348e-07, "logits/chosen": 23.279882431030273, "logits/rejected": 22.866252899169922, "logps/chosen": -376.61431884765625, "logps/rejected": -252.8503875732422, "loss": 0.4265, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.24882745742797852, "rewards/margins": 1.2063392400741577, "rewards/rejected": -1.4551665782928467, "step": 1560 }, { "epoch": 1.62, "learning_rate": 2.5526215078453884e-07, "logits/chosen": 23.572551727294922, "logits/rejected": 23.359222412109375, "logps/chosen": -348.06396484375, "logps/rejected": -301.94830322265625, "loss": 0.4353, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.1706874668598175, "rewards/margins": 1.228468656539917, "rewards/rejected": -1.399156093597412, "step": 1570 }, { "epoch": 1.63, "learning_rate": 2.533486414083429e-07, "logits/chosen": 23.559356689453125, "logits/rejected": 23.3609676361084, "logps/chosen": -354.1952209472656, "logps/rejected": -299.01385498046875, "loss": 0.392, "rewards/accuracies": 0.875, "rewards/chosen": 0.025229115039110184, "rewards/margins": 1.3790032863616943, "rewards/rejected": -1.3537743091583252, "step": 1580 }, { "epoch": 1.64, "learning_rate": 2.5143513203214697e-07, "logits/chosen": 23.176847457885742, "logits/rejected": 23.128990173339844, "logps/chosen": -395.842529296875, "logps/rejected": -295.98162841796875, "loss": 0.4379, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.10837908089160919, "rewards/margins": 1.3724091053009033, "rewards/rejected": -1.480788230895996, "step": 1590 }, { "epoch": 1.65, "learning_rate": 2.49521622655951e-07, "logits/chosen": 23.313915252685547, "logits/rejected": 23.103626251220703, "logps/chosen": -350.606689453125, "logps/rejected": -294.15594482421875, "loss": 0.4363, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.2652584910392761, "rewards/margins": 1.1176103353500366, "rewards/rejected": -1.382868766784668, "step": 1600 }, { "epoch": 1.65, "eval_logits/chosen": 23.417835235595703, "eval_logits/rejected": 23.231985092163086, "eval_logps/chosen": -357.8829650878906, "eval_logps/rejected": -287.6752014160156, "eval_loss": 0.5009579062461853, "eval_rewards/accuracies": 0.773809552192688, "eval_rewards/chosen": -0.3312842845916748, "eval_rewards/margins": 1.0278921127319336, "eval_rewards/rejected": -1.3591763973236084, "eval_runtime": 211.3907, "eval_samples_per_second": 9.461, "eval_steps_per_second": 0.298, "step": 1600 }, { "epoch": 1.66, "learning_rate": 2.4760811327975504e-07, "logits/chosen": 23.531373977661133, "logits/rejected": 23.429351806640625, "logps/chosen": -347.46490478515625, "logps/rejected": -289.24176025390625, "loss": 0.4249, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.28774064779281616, "rewards/margins": 1.0696442127227783, "rewards/rejected": -1.3573849201202393, "step": 1610 }, { "epoch": 1.67, "learning_rate": 2.456946039035591e-07, "logits/chosen": 23.160110473632812, "logits/rejected": 23.07761001586914, "logps/chosen": -372.6455993652344, "logps/rejected": -254.6509246826172, "loss": 0.4312, "rewards/accuracies": 0.8125, "rewards/chosen": -0.29867392778396606, "rewards/margins": 1.093621850013733, "rewards/rejected": -1.3922955989837646, "step": 1620 }, { "epoch": 1.68, "learning_rate": 2.4378109452736316e-07, "logits/chosen": 23.148435592651367, "logits/rejected": 23.090463638305664, "logps/chosen": -316.5802917480469, "logps/rejected": -288.7501220703125, "loss": 0.4262, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.3632759749889374, "rewards/margins": 1.0216796398162842, "rewards/rejected": -1.3849557638168335, "step": 1630 }, { "epoch": 1.69, "learning_rate": 2.418675851511672e-07, "logits/chosen": 23.17940330505371, "logits/rejected": 23.190204620361328, "logps/chosen": -346.47625732421875, "logps/rejected": -270.1147155761719, "loss": 0.4333, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.2974759638309479, "rewards/margins": 1.0174903869628906, "rewards/rejected": -1.3149662017822266, "step": 1640 }, { "epoch": 1.7, "learning_rate": 2.399540757749713e-07, "logits/chosen": 23.138202667236328, "logits/rejected": 22.985610961914062, "logps/chosen": -340.69940185546875, "logps/rejected": -311.3159484863281, "loss": 0.4341, "rewards/accuracies": 0.875, "rewards/chosen": -0.1574067324399948, "rewards/margins": 1.2777574062347412, "rewards/rejected": -1.4351643323898315, "step": 1650 }, { "epoch": 1.71, "learning_rate": 2.3804056639877535e-07, "logits/chosen": 23.348569869995117, "logits/rejected": 23.167980194091797, "logps/chosen": -273.6280822753906, "logps/rejected": -238.4679412841797, "loss": 0.4392, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.03210877254605293, "rewards/margins": 1.156890630722046, "rewards/rejected": -1.1889994144439697, "step": 1660 }, { "epoch": 1.72, "learning_rate": 2.361270570225794e-07, "logits/chosen": 23.42279052734375, "logits/rejected": 23.08903694152832, "logps/chosen": -358.487548828125, "logps/rejected": -266.75341796875, "loss": 0.3848, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.0037168667186051607, "rewards/margins": 1.2687807083129883, "rewards/rejected": -1.272497534751892, "step": 1670 }, { "epoch": 1.73, "learning_rate": 2.3421354764638345e-07, "logits/chosen": 23.328954696655273, "logits/rejected": 23.21335792541504, "logps/chosen": -294.31402587890625, "logps/rejected": -263.2884826660156, "loss": 0.44, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.1213906854391098, "rewards/margins": 0.9413496255874634, "rewards/rejected": -1.0627403259277344, "step": 1680 }, { "epoch": 1.74, "learning_rate": 2.323000382701875e-07, "logits/chosen": 23.385282516479492, "logits/rejected": 23.229637145996094, "logps/chosen": -392.8078308105469, "logps/rejected": -314.957275390625, "loss": 0.4084, "rewards/accuracies": 0.875, "rewards/chosen": -0.1200430616736412, "rewards/margins": 1.2001924514770508, "rewards/rejected": -1.3202354907989502, "step": 1690 }, { "epoch": 1.76, "learning_rate": 2.3038652889399157e-07, "logits/chosen": 23.32499122619629, "logits/rejected": 23.208293914794922, "logps/chosen": -338.33892822265625, "logps/rejected": -305.3815612792969, "loss": 0.408, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.3193683624267578, "rewards/margins": 1.028808832168579, "rewards/rejected": -1.348177433013916, "step": 1700 }, { "epoch": 1.76, "eval_logits/chosen": 23.395021438598633, "eval_logits/rejected": 23.213520050048828, "eval_logps/chosen": -357.0264892578125, "eval_logps/rejected": -287.1567687988281, "eval_loss": 0.4989284873008728, "eval_rewards/accuracies": 0.7777777910232544, "eval_rewards/chosen": -0.24563594162464142, "eval_rewards/margins": 1.0617001056671143, "eval_rewards/rejected": -1.3073359727859497, "eval_runtime": 212.6457, "eval_samples_per_second": 9.405, "eval_steps_per_second": 0.296, "step": 1700 }, { "epoch": 1.77, "learning_rate": 2.2847301951779563e-07, "logits/chosen": 23.246747970581055, "logits/rejected": 23.268218994140625, "logps/chosen": -298.40838623046875, "logps/rejected": -294.05877685546875, "loss": 0.4063, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.37956395745277405, "rewards/margins": 1.2896459102630615, "rewards/rejected": -1.6692098379135132, "step": 1710 }, { "epoch": 1.78, "learning_rate": 2.265595101415997e-07, "logits/chosen": 23.170940399169922, "logits/rejected": 23.139057159423828, "logps/chosen": -333.7245178222656, "logps/rejected": -287.9451599121094, "loss": 0.4352, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.23647813498973846, "rewards/margins": 1.159517526626587, "rewards/rejected": -1.3959954977035522, "step": 1720 }, { "epoch": 1.79, "learning_rate": 2.2464600076540373e-07, "logits/chosen": 23.052528381347656, "logits/rejected": 22.96520233154297, "logps/chosen": -327.0295104980469, "logps/rejected": -272.34539794921875, "loss": 0.4108, "rewards/accuracies": 0.75, "rewards/chosen": -0.3730023503303528, "rewards/margins": 1.0741784572601318, "rewards/rejected": -1.4471808671951294, "step": 1730 }, { "epoch": 1.8, "learning_rate": 2.227324913892078e-07, "logits/chosen": 23.350069046020508, "logits/rejected": 23.162134170532227, "logps/chosen": -338.16583251953125, "logps/rejected": -292.6080627441406, "loss": 0.4213, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.24311116337776184, "rewards/margins": 1.0276445150375366, "rewards/rejected": -1.270755648612976, "step": 1740 }, { "epoch": 1.81, "learning_rate": 2.2081898201301186e-07, "logits/chosen": 23.173582077026367, "logits/rejected": 23.145183563232422, "logps/chosen": -329.4697265625, "logps/rejected": -266.3700256347656, "loss": 0.407, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.023505190387368202, "rewards/margins": 1.2212746143341064, "rewards/rejected": -1.2447797060012817, "step": 1750 }, { "epoch": 1.82, "learning_rate": 2.1890547263681592e-07, "logits/chosen": 23.282878875732422, "logits/rejected": 23.070077896118164, "logps/chosen": -321.710205078125, "logps/rejected": -277.3352966308594, "loss": 0.4071, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.25008895993232727, "rewards/margins": 1.3580764532089233, "rewards/rejected": -1.6081653833389282, "step": 1760 }, { "epoch": 1.83, "learning_rate": 2.1699196326061998e-07, "logits/chosen": 23.08016586303711, "logits/rejected": 23.203523635864258, "logps/chosen": -297.59130859375, "logps/rejected": -280.3871765136719, "loss": 0.4554, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.3134928345680237, "rewards/margins": 1.07438063621521, "rewards/rejected": -1.387873649597168, "step": 1770 }, { "epoch": 1.84, "learning_rate": 2.1507845388442402e-07, "logits/chosen": 23.223857879638672, "logits/rejected": 23.118579864501953, "logps/chosen": -307.32745361328125, "logps/rejected": -264.244384765625, "loss": 0.4215, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.4751613140106201, "rewards/margins": 1.2573087215423584, "rewards/rejected": -1.732469916343689, "step": 1780 }, { "epoch": 1.85, "learning_rate": 2.1316494450822808e-07, "logits/chosen": 23.08175277709961, "logits/rejected": 23.210302352905273, "logps/chosen": -348.2358093261719, "logps/rejected": -269.00909423828125, "loss": 0.4148, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.5350313186645508, "rewards/margins": 1.2447912693023682, "rewards/rejected": -1.779822587966919, "step": 1790 }, { "epoch": 1.86, "learning_rate": 2.1125143513203214e-07, "logits/chosen": 23.117109298706055, "logits/rejected": 23.03956413269043, "logps/chosen": -350.7597961425781, "logps/rejected": -257.7859191894531, "loss": 0.4076, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.42596331238746643, "rewards/margins": 1.1088837385177612, "rewards/rejected": -1.5348470211029053, "step": 1800 }, { "epoch": 1.86, "eval_logits/chosen": 23.361677169799805, "eval_logits/rejected": 23.18657112121582, "eval_logps/chosen": -358.4737854003906, "eval_logps/rejected": -288.44818115234375, "eval_loss": 0.4995974004268646, "eval_rewards/accuracies": 0.7658730149269104, "eval_rewards/chosen": -0.39036476612091064, "eval_rewards/margins": 1.0461114645004272, "eval_rewards/rejected": -1.4364763498306274, "eval_runtime": 207.254, "eval_samples_per_second": 9.65, "eval_steps_per_second": 0.304, "step": 1800 }, { "epoch": 1.87, "learning_rate": 2.093379257558362e-07, "logits/chosen": 23.419483184814453, "logits/rejected": 23.20507049560547, "logps/chosen": -321.2181396484375, "logps/rejected": -264.3491516113281, "loss": 0.4189, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.5122248530387878, "rewards/margins": 1.2178277969360352, "rewards/rejected": -1.7300525903701782, "step": 1810 }, { "epoch": 1.88, "learning_rate": 2.0742441637964026e-07, "logits/chosen": 22.683643341064453, "logits/rejected": 22.83184242248535, "logps/chosen": -339.54498291015625, "logps/rejected": -250.641845703125, "loss": 0.4102, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.3275443911552429, "rewards/margins": 1.4303803443908691, "rewards/rejected": -1.7579247951507568, "step": 1820 }, { "epoch": 1.89, "learning_rate": 2.055109070034443e-07, "logits/chosen": 23.576740264892578, "logits/rejected": 23.395267486572266, "logps/chosen": -349.0409240722656, "logps/rejected": -268.94268798828125, "loss": 0.4055, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.29274967312812805, "rewards/margins": 1.1743746995925903, "rewards/rejected": -1.4671242237091064, "step": 1830 }, { "epoch": 1.9, "learning_rate": 2.0359739762724836e-07, "logits/chosen": 23.352815628051758, "logits/rejected": 23.3568058013916, "logps/chosen": -352.31304931640625, "logps/rejected": -284.7132873535156, "loss": 0.4133, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.33104512095451355, "rewards/margins": 1.168330192565918, "rewards/rejected": -1.499375343322754, "step": 1840 }, { "epoch": 1.91, "learning_rate": 2.0168388825105242e-07, "logits/chosen": 23.261768341064453, "logits/rejected": 23.228496551513672, "logps/chosen": -345.1585998535156, "logps/rejected": -339.8830871582031, "loss": 0.3868, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.12148020416498184, "rewards/margins": 1.3025624752044678, "rewards/rejected": -1.424042820930481, "step": 1850 }, { "epoch": 1.92, "learning_rate": 1.997703788748565e-07, "logits/chosen": 23.22171401977539, "logits/rejected": 22.969928741455078, "logps/chosen": -328.1639099121094, "logps/rejected": -222.90762329101562, "loss": 0.406, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.09366317093372345, "rewards/margins": 1.3427413702011108, "rewards/rejected": -1.4364044666290283, "step": 1860 }, { "epoch": 1.93, "learning_rate": 1.9785686949866055e-07, "logits/chosen": 22.87033462524414, "logits/rejected": 23.068653106689453, "logps/chosen": -331.31146240234375, "logps/rejected": -285.5386962890625, "loss": 0.4169, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.057704973965883255, "rewards/margins": 1.1090171337127686, "rewards/rejected": -1.1667221784591675, "step": 1870 }, { "epoch": 1.94, "learning_rate": 1.9594336012246458e-07, "logits/chosen": 23.055828094482422, "logits/rejected": 23.081539154052734, "logps/chosen": -312.4957275390625, "logps/rejected": -268.7204895019531, "loss": 0.3909, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.07027649134397507, "rewards/margins": 1.1952614784240723, "rewards/rejected": -1.2655378580093384, "step": 1880 }, { "epoch": 1.95, "learning_rate": 1.9402985074626865e-07, "logits/chosen": 22.958328247070312, "logits/rejected": 23.175926208496094, "logps/chosen": -312.7813720703125, "logps/rejected": -272.52215576171875, "loss": 0.4153, "rewards/accuracies": 0.8125, "rewards/chosen": -0.2530692219734192, "rewards/margins": 1.2337100505828857, "rewards/rejected": -1.4867792129516602, "step": 1890 }, { "epoch": 1.96, "learning_rate": 1.921163413700727e-07, "logits/chosen": 23.173688888549805, "logits/rejected": 23.038455963134766, "logps/chosen": -327.50531005859375, "logps/rejected": -257.269287109375, "loss": 0.4547, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.3794935941696167, "rewards/margins": 1.0488073825836182, "rewards/rejected": -1.4283010959625244, "step": 1900 }, { "epoch": 1.96, "eval_logits/chosen": 23.329803466796875, "eval_logits/rejected": 23.160478591918945, "eval_logps/chosen": -357.08575439453125, "eval_logps/rejected": -286.7316589355469, "eval_loss": 0.5008072853088379, "eval_rewards/accuracies": 0.7857142686843872, "eval_rewards/chosen": -0.25156161189079285, "eval_rewards/margins": 1.0132601261138916, "eval_rewards/rejected": -1.2648216485977173, "eval_runtime": 212.8249, "eval_samples_per_second": 9.397, "eval_steps_per_second": 0.296, "step": 1900 }, { "epoch": 1.97, "learning_rate": 1.9020283199387677e-07, "logits/chosen": 23.098756790161133, "logits/rejected": 23.10630226135254, "logps/chosen": -345.1263122558594, "logps/rejected": -260.34613037109375, "loss": 0.4359, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.14109382033348083, "rewards/margins": 1.0066546201705933, "rewards/rejected": -1.147748589515686, "step": 1910 }, { "epoch": 1.98, "learning_rate": 1.8828932261768083e-07, "logits/chosen": 23.233016967773438, "logits/rejected": 23.24778175354004, "logps/chosen": -295.8356628417969, "logps/rejected": -244.9901885986328, "loss": 0.4224, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.49083733558654785, "rewards/margins": 1.3773233890533447, "rewards/rejected": -1.868160605430603, "step": 1920 }, { "epoch": 1.99, "learning_rate": 1.8637581324148487e-07, "logits/chosen": 23.030765533447266, "logits/rejected": 22.91606330871582, "logps/chosen": -311.2125549316406, "logps/rejected": -287.0599670410156, "loss": 0.4249, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.3956405520439148, "rewards/margins": 1.194427728652954, "rewards/rejected": -1.5900681018829346, "step": 1930 }, { "epoch": 2.0, "learning_rate": 1.8446230386528893e-07, "logits/chosen": 22.784257888793945, "logits/rejected": 22.93459701538086, "logps/chosen": -270.6285095214844, "logps/rejected": -242.1689453125, "loss": 0.3953, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.2096777856349945, "rewards/margins": 1.2774814367294312, "rewards/rejected": -1.487159252166748, "step": 1940 }, { "epoch": 2.01, "learning_rate": 1.82548794489093e-07, "logits/chosen": 22.975915908813477, "logits/rejected": 23.076732635498047, "logps/chosen": -285.87164306640625, "logps/rejected": -285.27105712890625, "loss": 0.3506, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.4368739724159241, "rewards/margins": 1.2533791065216064, "rewards/rejected": -1.6902532577514648, "step": 1950 }, { "epoch": 2.02, "learning_rate": 1.8063528511289706e-07, "logits/chosen": 23.094058990478516, "logits/rejected": 23.000532150268555, "logps/chosen": -311.85418701171875, "logps/rejected": -342.6611633300781, "loss": 0.3454, "rewards/accuracies": 0.875, "rewards/chosen": -0.3331337571144104, "rewards/margins": 1.497775912284851, "rewards/rejected": -1.8309099674224854, "step": 1960 }, { "epoch": 2.03, "learning_rate": 1.7872177573670112e-07, "logits/chosen": 23.15587043762207, "logits/rejected": 22.996431350708008, "logps/chosen": -284.33099365234375, "logps/rejected": -247.60586547851562, "loss": 0.3782, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.3313903212547302, "rewards/margins": 1.394980549812317, "rewards/rejected": -1.7263710498809814, "step": 1970 }, { "epoch": 2.04, "learning_rate": 1.7680826636050515e-07, "logits/chosen": 23.24311637878418, "logits/rejected": 23.19965171813965, "logps/chosen": -307.9765930175781, "logps/rejected": -258.64697265625, "loss": 0.3464, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.12858574092388153, "rewards/margins": 1.5073888301849365, "rewards/rejected": -1.635974645614624, "step": 1980 }, { "epoch": 2.05, "learning_rate": 1.7489475698430921e-07, "logits/chosen": 23.421756744384766, "logits/rejected": 23.180278778076172, "logps/chosen": -351.59796142578125, "logps/rejected": -271.3390808105469, "loss": 0.3522, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.3069804012775421, "rewards/margins": 1.5511845350265503, "rewards/rejected": -1.8581645488739014, "step": 1990 }, { "epoch": 2.07, "learning_rate": 1.7298124760811328e-07, "logits/chosen": 23.29781723022461, "logits/rejected": 23.215654373168945, "logps/chosen": -335.22943115234375, "logps/rejected": -245.5157470703125, "loss": 0.3469, "rewards/accuracies": 0.875, "rewards/chosen": -0.39654579758644104, "rewards/margins": 1.623525857925415, "rewards/rejected": -2.020071506500244, "step": 2000 }, { "epoch": 2.07, "eval_logits/chosen": 23.29904556274414, "eval_logits/rejected": 23.136056900024414, "eval_logps/chosen": -357.43829345703125, "eval_logps/rejected": -287.9998779296875, "eval_loss": 0.49774664640426636, "eval_rewards/accuracies": 0.7777777910232544, "eval_rewards/chosen": -0.28681743144989014, "eval_rewards/margins": 1.10482656955719, "eval_rewards/rejected": -1.39164400100708, "eval_runtime": 207.5885, "eval_samples_per_second": 9.634, "eval_steps_per_second": 0.303, "step": 2000 }, { "epoch": 2.08, "learning_rate": 1.7106773823191734e-07, "logits/chosen": 23.3192138671875, "logits/rejected": 23.1693115234375, "logps/chosen": -360.62701416015625, "logps/rejected": -267.00836181640625, "loss": 0.3505, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.2135746031999588, "rewards/margins": 1.6586072444915771, "rewards/rejected": -1.8721816539764404, "step": 2010 }, { "epoch": 2.09, "learning_rate": 1.691542288557214e-07, "logits/chosen": 22.96431541442871, "logits/rejected": 22.824430465698242, "logps/chosen": -327.3959655761719, "logps/rejected": -293.3819580078125, "loss": 0.3412, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.4780098795890808, "rewards/margins": 1.3323651552200317, "rewards/rejected": -1.8103749752044678, "step": 2020 }, { "epoch": 2.1, "learning_rate": 1.6724071947952544e-07, "logits/chosen": 22.99736213684082, "logits/rejected": 22.809282302856445, "logps/chosen": -286.00433349609375, "logps/rejected": -253.54635620117188, "loss": 0.3509, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.5133158564567566, "rewards/margins": 1.4120731353759766, "rewards/rejected": -1.9253889322280884, "step": 2030 }, { "epoch": 2.11, "learning_rate": 1.653272101033295e-07, "logits/chosen": 22.923995971679688, "logits/rejected": 22.87368392944336, "logps/chosen": -293.9073791503906, "logps/rejected": -263.51397705078125, "loss": 0.3527, "rewards/accuracies": 0.875, "rewards/chosen": -0.28854408860206604, "rewards/margins": 1.3332937955856323, "rewards/rejected": -1.6218379735946655, "step": 2040 }, { "epoch": 2.12, "learning_rate": 1.6341370072713356e-07, "logits/chosen": 23.294551849365234, "logits/rejected": 23.248403549194336, "logps/chosen": -354.1997985839844, "logps/rejected": -338.943603515625, "loss": 0.3538, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.2328769713640213, "rewards/margins": 1.2254259586334229, "rewards/rejected": -1.4583029747009277, "step": 2050 }, { "epoch": 2.13, "learning_rate": 1.6150019135093762e-07, "logits/chosen": 23.40046501159668, "logits/rejected": 23.170814514160156, "logps/chosen": -376.11871337890625, "logps/rejected": -280.2356872558594, "loss": 0.3409, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.1297599822282791, "rewards/margins": 1.5910810232162476, "rewards/rejected": -1.7208411693572998, "step": 2060 }, { "epoch": 2.14, "learning_rate": 1.5958668197474169e-07, "logits/chosen": 23.402286529541016, "logits/rejected": 23.344829559326172, "logps/chosen": -316.00634765625, "logps/rejected": -308.0472717285156, "loss": 0.3556, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.00810793973505497, "rewards/margins": 1.592138409614563, "rewards/rejected": -1.5840303897857666, "step": 2070 }, { "epoch": 2.15, "learning_rate": 1.5767317259854572e-07, "logits/chosen": 23.049579620361328, "logits/rejected": 23.11331558227539, "logps/chosen": -320.4759826660156, "logps/rejected": -237.2650604248047, "loss": 0.3655, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.1983695775270462, "rewards/margins": 1.2847638130187988, "rewards/rejected": -1.4831334352493286, "step": 2080 }, { "epoch": 2.16, "learning_rate": 1.5575966322234978e-07, "logits/chosen": 23.234739303588867, "logits/rejected": 22.95262336730957, "logps/chosen": -360.20367431640625, "logps/rejected": -292.31317138671875, "loss": 0.3307, "rewards/accuracies": 0.875, "rewards/chosen": -0.17867961525917053, "rewards/margins": 1.6552801132202148, "rewards/rejected": -1.8339598178863525, "step": 2090 }, { "epoch": 2.17, "learning_rate": 1.5384615384615385e-07, "logits/chosen": 23.25923728942871, "logits/rejected": 23.007633209228516, "logps/chosen": -305.6047058105469, "logps/rejected": -246.4465789794922, "loss": 0.3547, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.2684822380542755, "rewards/margins": 1.560947060585022, "rewards/rejected": -1.829429268836975, "step": 2100 }, { "epoch": 2.17, "eval_logits/chosen": 23.273019790649414, "eval_logits/rejected": 23.114229202270508, "eval_logps/chosen": -358.821044921875, "eval_logps/rejected": -289.5934753417969, "eval_loss": 0.49868160486221313, "eval_rewards/accuracies": 0.761904776096344, "eval_rewards/chosen": -0.42508772015571594, "eval_rewards/margins": 1.125916838645935, "eval_rewards/rejected": -1.5510046482086182, "eval_runtime": 211.1219, "eval_samples_per_second": 9.473, "eval_steps_per_second": 0.298, "step": 2100 }, { "epoch": 2.18, "learning_rate": 1.519326444699579e-07, "logits/chosen": 23.230274200439453, "logits/rejected": 23.15807342529297, "logps/chosen": -334.8540954589844, "logps/rejected": -263.5167236328125, "loss": 0.3289, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.251259982585907, "rewards/margins": 1.5572983026504517, "rewards/rejected": -1.8085582256317139, "step": 2110 }, { "epoch": 2.19, "learning_rate": 1.5001913509376197e-07, "logits/chosen": 23.217041015625, "logits/rejected": 23.1701717376709, "logps/chosen": -330.7687683105469, "logps/rejected": -285.98907470703125, "loss": 0.3463, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.2551344037055969, "rewards/margins": 1.4968717098236084, "rewards/rejected": -1.7520062923431396, "step": 2120 }, { "epoch": 2.2, "learning_rate": 1.4810562571756603e-07, "logits/chosen": 23.419551849365234, "logits/rejected": 23.216039657592773, "logps/chosen": -312.8890380859375, "logps/rejected": -267.16729736328125, "loss": 0.3688, "rewards/accuracies": 0.8125, "rewards/chosen": -0.34911391139030457, "rewards/margins": 1.548995018005371, "rewards/rejected": -1.8981088399887085, "step": 2130 }, { "epoch": 2.21, "learning_rate": 1.4619211634137007e-07, "logits/chosen": 22.947824478149414, "logits/rejected": 22.82015037536621, "logps/chosen": -313.2303771972656, "logps/rejected": -252.9609375, "loss": 0.3447, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.2606348693370819, "rewards/margins": 1.5146172046661377, "rewards/rejected": -1.7752519845962524, "step": 2140 }, { "epoch": 2.22, "learning_rate": 1.4427860696517413e-07, "logits/chosen": 23.21828269958496, "logits/rejected": 23.22684097290039, "logps/chosen": -369.90924072265625, "logps/rejected": -314.48016357421875, "loss": 0.3561, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.12711670994758606, "rewards/margins": 1.666666030883789, "rewards/rejected": -1.7937828302383423, "step": 2150 }, { "epoch": 2.23, "learning_rate": 1.423650975889782e-07, "logits/chosen": 23.231754302978516, "logits/rejected": 23.19542694091797, "logps/chosen": -351.79913330078125, "logps/rejected": -271.44427490234375, "loss": 0.3303, "rewards/accuracies": 0.875, "rewards/chosen": -0.14719603955745697, "rewards/margins": 1.6395785808563232, "rewards/rejected": -1.7867748737335205, "step": 2160 }, { "epoch": 2.24, "learning_rate": 1.4045158821278225e-07, "logits/chosen": 22.95505142211914, "logits/rejected": 22.855873107910156, "logps/chosen": -340.75238037109375, "logps/rejected": -316.02386474609375, "loss": 0.3293, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.26123401522636414, "rewards/margins": 1.4234856367111206, "rewards/rejected": -1.6847198009490967, "step": 2170 }, { "epoch": 2.25, "learning_rate": 1.3853807883658632e-07, "logits/chosen": 23.029155731201172, "logits/rejected": 23.120052337646484, "logps/chosen": -371.22100830078125, "logps/rejected": -319.261474609375, "loss": 0.3694, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.3324764668941498, "rewards/margins": 1.6106780767440796, "rewards/rejected": -1.9431545734405518, "step": 2180 }, { "epoch": 2.26, "learning_rate": 1.3662456946039035e-07, "logits/chosen": 23.185413360595703, "logits/rejected": 22.925167083740234, "logps/chosen": -305.14410400390625, "logps/rejected": -296.2259521484375, "loss": 0.3664, "rewards/accuracies": 0.8125, "rewards/chosen": -0.40003857016563416, "rewards/margins": 1.3513580560684204, "rewards/rejected": -1.7513965368270874, "step": 2190 }, { "epoch": 2.27, "learning_rate": 1.3471106008419441e-07, "logits/chosen": 23.18272590637207, "logits/rejected": 23.115558624267578, "logps/chosen": -306.1884460449219, "logps/rejected": -288.0213928222656, "loss": 0.3468, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.359417200088501, "rewards/margins": 1.435274362564087, "rewards/rejected": -1.7946914434432983, "step": 2200 }, { "epoch": 2.27, "eval_logits/chosen": 23.256072998046875, "eval_logits/rejected": 23.099788665771484, "eval_logps/chosen": -357.2442932128906, "eval_logps/rejected": -288.0285339355469, "eval_loss": 0.49792206287384033, "eval_rewards/accuracies": 0.7777777910232544, "eval_rewards/chosen": -0.26741600036621094, "eval_rewards/margins": 1.1270908117294312, "eval_rewards/rejected": -1.394506812095642, "eval_runtime": 210.9966, "eval_samples_per_second": 9.479, "eval_steps_per_second": 0.299, "step": 2200 }, { "epoch": 2.28, "learning_rate": 1.3279755070799848e-07, "logits/chosen": 23.182937622070312, "logits/rejected": 23.00518035888672, "logps/chosen": -325.1390075683594, "logps/rejected": -254.9105224609375, "loss": 0.3562, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.2701284885406494, "rewards/margins": 1.4486573934555054, "rewards/rejected": -1.7187858819961548, "step": 2210 }, { "epoch": 2.29, "learning_rate": 1.3088404133180254e-07, "logits/chosen": 23.192270278930664, "logits/rejected": 22.86314582824707, "logps/chosen": -354.3294677734375, "logps/rejected": -277.08319091796875, "loss": 0.3275, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.3135210871696472, "rewards/margins": 1.6535711288452148, "rewards/rejected": -1.9670922756195068, "step": 2220 }, { "epoch": 2.3, "learning_rate": 1.289705319556066e-07, "logits/chosen": 23.285938262939453, "logits/rejected": 23.20859146118164, "logps/chosen": -341.8558044433594, "logps/rejected": -260.49853515625, "loss": 0.3339, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.49324899911880493, "rewards/margins": 1.3661364316940308, "rewards/rejected": -1.8593854904174805, "step": 2230 }, { "epoch": 2.31, "learning_rate": 1.2705702257941064e-07, "logits/chosen": 23.054574966430664, "logits/rejected": 22.94180679321289, "logps/chosen": -314.9513244628906, "logps/rejected": -253.89779663085938, "loss": 0.3683, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.5942099690437317, "rewards/margins": 1.3034783601760864, "rewards/rejected": -1.8976882696151733, "step": 2240 }, { "epoch": 2.32, "learning_rate": 1.251435132032147e-07, "logits/chosen": 23.040285110473633, "logits/rejected": 23.092458724975586, "logps/chosen": -347.84820556640625, "logps/rejected": -300.02069091796875, "loss": 0.3424, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.3747033476829529, "rewards/margins": 1.4116895198822021, "rewards/rejected": -1.7863928079605103, "step": 2250 }, { "epoch": 2.33, "learning_rate": 1.2323000382701873e-07, "logits/chosen": 23.061431884765625, "logits/rejected": 22.941814422607422, "logps/chosen": -363.65850830078125, "logps/rejected": -299.712890625, "loss": 0.3701, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.34447842836380005, "rewards/margins": 1.6744133234024048, "rewards/rejected": -2.0188918113708496, "step": 2260 }, { "epoch": 2.34, "learning_rate": 1.213164944508228e-07, "logits/chosen": 23.18351936340332, "logits/rejected": 23.060955047607422, "logps/chosen": -386.75701904296875, "logps/rejected": -311.6101379394531, "loss": 0.3375, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.2950562834739685, "rewards/margins": 1.5026452541351318, "rewards/rejected": -1.7977014780044556, "step": 2270 }, { "epoch": 2.35, "learning_rate": 1.1940298507462686e-07, "logits/chosen": 23.240421295166016, "logits/rejected": 23.235857009887695, "logps/chosen": -306.81561279296875, "logps/rejected": -249.85324096679688, "loss": 0.3425, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.47128137946128845, "rewards/margins": 1.526531457901001, "rewards/rejected": -1.9978128671646118, "step": 2280 }, { "epoch": 2.36, "learning_rate": 1.1748947569843092e-07, "logits/chosen": 23.31965446472168, "logits/rejected": 23.00503158569336, "logps/chosen": -366.4286193847656, "logps/rejected": -287.65399169921875, "loss": 0.3404, "rewards/accuracies": 0.875, "rewards/chosen": -0.20483896136283875, "rewards/margins": 1.4863460063934326, "rewards/rejected": -1.6911849975585938, "step": 2290 }, { "epoch": 2.37, "learning_rate": 1.1557596632223497e-07, "logits/chosen": 23.109445571899414, "logits/rejected": 23.121734619140625, "logps/chosen": -339.75689697265625, "logps/rejected": -266.4504699707031, "loss": 0.3432, "rewards/accuracies": 0.8125, "rewards/chosen": -0.32633692026138306, "rewards/margins": 1.3159123659133911, "rewards/rejected": -1.642249345779419, "step": 2300 }, { "epoch": 2.37, "eval_logits/chosen": 23.223342895507812, "eval_logits/rejected": 23.0726318359375, "eval_logps/chosen": -358.362060546875, "eval_logps/rejected": -288.7130126953125, "eval_loss": 0.5026321411132812, "eval_rewards/accuracies": 0.773809552192688, "eval_rewards/chosen": -0.3791937828063965, "eval_rewards/margins": 1.0837651491165161, "eval_rewards/rejected": -1.4629590511322021, "eval_runtime": 212.4288, "eval_samples_per_second": 9.415, "eval_steps_per_second": 0.297, "step": 2300 }, { "epoch": 2.39, "learning_rate": 1.1366245694603903e-07, "logits/chosen": 22.923072814941406, "logits/rejected": 22.96480369567871, "logps/chosen": -329.4259338378906, "logps/rejected": -294.24127197265625, "loss": 0.3706, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3439275026321411, "rewards/margins": 1.2762069702148438, "rewards/rejected": -1.6201345920562744, "step": 2310 }, { "epoch": 2.4, "learning_rate": 1.1174894756984308e-07, "logits/chosen": 23.266117095947266, "logits/rejected": 23.19167137145996, "logps/chosen": -274.62237548828125, "logps/rejected": -247.6970672607422, "loss": 0.3588, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.4183998107910156, "rewards/margins": 1.5969486236572266, "rewards/rejected": -2.015348434448242, "step": 2320 }, { "epoch": 2.41, "learning_rate": 1.0983543819364714e-07, "logits/chosen": 23.18532943725586, "logits/rejected": 23.074344635009766, "logps/chosen": -333.7981262207031, "logps/rejected": -262.7727966308594, "loss": 0.3035, "rewards/accuracies": 0.875, "rewards/chosen": -0.3411490321159363, "rewards/margins": 1.6993077993392944, "rewards/rejected": -2.040456771850586, "step": 2330 }, { "epoch": 2.42, "learning_rate": 1.079219288174512e-07, "logits/chosen": 22.650278091430664, "logits/rejected": 22.700809478759766, "logps/chosen": -256.5923156738281, "logps/rejected": -267.3676452636719, "loss": 0.3527, "rewards/accuracies": 0.875, "rewards/chosen": -0.5605247616767883, "rewards/margins": 1.3785268068313599, "rewards/rejected": -1.939051628112793, "step": 2340 }, { "epoch": 2.43, "learning_rate": 1.0600841944125525e-07, "logits/chosen": 23.22505760192871, "logits/rejected": 23.044265747070312, "logps/chosen": -377.84765625, "logps/rejected": -308.9931945800781, "loss": 0.3613, "rewards/accuracies": 0.9375, "rewards/chosen": -0.2691905200481415, "rewards/margins": 1.7195707559585571, "rewards/rejected": -1.988761305809021, "step": 2350 }, { "epoch": 2.44, "learning_rate": 1.0409491006505931e-07, "logits/chosen": 23.209278106689453, "logits/rejected": 23.045013427734375, "logps/chosen": -341.085205078125, "logps/rejected": -284.80987548828125, "loss": 0.3373, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.32976096868515015, "rewards/margins": 1.4313738346099854, "rewards/rejected": -1.7611347436904907, "step": 2360 }, { "epoch": 2.45, "learning_rate": 1.0218140068886336e-07, "logits/chosen": 22.989057540893555, "logits/rejected": 22.930797576904297, "logps/chosen": -349.1778564453125, "logps/rejected": -276.46905517578125, "loss": 0.351, "rewards/accuracies": 0.875, "rewards/chosen": -0.3507656157016754, "rewards/margins": 1.4063034057617188, "rewards/rejected": -1.7570692300796509, "step": 2370 }, { "epoch": 2.46, "learning_rate": 1.0026789131266743e-07, "logits/chosen": 22.69498062133789, "logits/rejected": 22.762619018554688, "logps/chosen": -319.37701416015625, "logps/rejected": -285.0171813964844, "loss": 0.3298, "rewards/accuracies": 0.875, "rewards/chosen": -0.41146141290664673, "rewards/margins": 1.4631173610687256, "rewards/rejected": -1.874578833580017, "step": 2380 }, { "epoch": 2.47, "learning_rate": 9.835438193647149e-08, "logits/chosen": 22.913543701171875, "logits/rejected": 22.7869873046875, "logps/chosen": -328.5509948730469, "logps/rejected": -253.9607696533203, "loss": 0.3565, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.4099615216255188, "rewards/margins": 1.3734912872314453, "rewards/rejected": -1.7834527492523193, "step": 2390 }, { "epoch": 2.48, "learning_rate": 9.644087256027554e-08, "logits/chosen": 23.1809139251709, "logits/rejected": 23.06944465637207, "logps/chosen": -286.20904541015625, "logps/rejected": -235.44140625, "loss": 0.324, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.5443722009658813, "rewards/margins": 1.3517777919769287, "rewards/rejected": -1.8961498737335205, "step": 2400 }, { "epoch": 2.48, "eval_logits/chosen": 23.200559616088867, "eval_logits/rejected": 23.054319381713867, "eval_logps/chosen": -359.46197509765625, "eval_logps/rejected": -290.1737060546875, "eval_loss": 0.5021990537643433, "eval_rewards/accuracies": 0.7698412537574768, "eval_rewards/chosen": -0.48918139934539795, "eval_rewards/margins": 1.1198451519012451, "eval_rewards/rejected": -1.609026551246643, "eval_runtime": 211.6095, "eval_samples_per_second": 9.451, "eval_steps_per_second": 0.298, "step": 2400 }, { "epoch": 2.49, "learning_rate": 9.45273631840796e-08, "logits/chosen": 22.888259887695312, "logits/rejected": 22.884002685546875, "logps/chosen": -351.7564392089844, "logps/rejected": -299.20611572265625, "loss": 0.3645, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.16312038898468018, "rewards/margins": 1.5724434852600098, "rewards/rejected": -1.73556387424469, "step": 2410 }, { "epoch": 2.5, "learning_rate": 9.261385380788366e-08, "logits/chosen": 23.231523513793945, "logits/rejected": 23.333255767822266, "logps/chosen": -338.1724548339844, "logps/rejected": -284.2950744628906, "loss": 0.3772, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.48969048261642456, "rewards/margins": 1.1433426141738892, "rewards/rejected": -1.633033037185669, "step": 2420 }, { "epoch": 2.51, "learning_rate": 9.070034443168771e-08, "logits/chosen": 23.16311264038086, "logits/rejected": 22.952455520629883, "logps/chosen": -304.960205078125, "logps/rejected": -273.458251953125, "loss": 0.3659, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.46648114919662476, "rewards/margins": 1.2427117824554443, "rewards/rejected": -1.7091929912567139, "step": 2430 }, { "epoch": 2.52, "learning_rate": 8.878683505549177e-08, "logits/chosen": 23.104694366455078, "logits/rejected": 23.080604553222656, "logps/chosen": -287.4930419921875, "logps/rejected": -265.7315368652344, "loss": 0.3395, "rewards/accuracies": 0.9375, "rewards/chosen": -0.0566287636756897, "rewards/margins": 1.7359685897827148, "rewards/rejected": -1.7925974130630493, "step": 2440 }, { "epoch": 2.53, "learning_rate": 8.687332567929582e-08, "logits/chosen": 22.983219146728516, "logits/rejected": 23.094844818115234, "logps/chosen": -316.54547119140625, "logps/rejected": -292.8838806152344, "loss": 0.3579, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.4759410321712494, "rewards/margins": 1.2076809406280518, "rewards/rejected": -1.6836220026016235, "step": 2450 }, { "epoch": 2.54, "learning_rate": 8.495981630309988e-08, "logits/chosen": 22.804758071899414, "logits/rejected": 22.753246307373047, "logps/chosen": -366.3809509277344, "logps/rejected": -290.44805908203125, "loss": 0.3613, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.37290042638778687, "rewards/margins": 1.3757580518722534, "rewards/rejected": -1.7486584186553955, "step": 2460 }, { "epoch": 2.55, "learning_rate": 8.304630692690395e-08, "logits/chosen": 22.425283432006836, "logits/rejected": 22.801471710205078, "logps/chosen": -316.58416748046875, "logps/rejected": -268.90423583984375, "loss": 0.3686, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.44941702485084534, "rewards/margins": 1.3106696605682373, "rewards/rejected": -1.7600864171981812, "step": 2470 }, { "epoch": 2.56, "learning_rate": 8.1132797550708e-08, "logits/chosen": 23.177642822265625, "logits/rejected": 23.025049209594727, "logps/chosen": -345.25958251953125, "logps/rejected": -269.18951416015625, "loss": 0.3337, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.33066803216934204, "rewards/margins": 1.614101767539978, "rewards/rejected": -1.9447696208953857, "step": 2480 }, { "epoch": 2.57, "learning_rate": 7.921928817451206e-08, "logits/chosen": 23.251216888427734, "logits/rejected": 23.127471923828125, "logps/chosen": -321.37078857421875, "logps/rejected": -251.3941192626953, "loss": 0.3158, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.3629501461982727, "rewards/margins": 1.3023030757904053, "rewards/rejected": -1.6652530431747437, "step": 2490 }, { "epoch": 2.58, "learning_rate": 7.73057787983161e-08, "logits/chosen": 23.11884880065918, "logits/rejected": 23.025787353515625, "logps/chosen": -372.3116760253906, "logps/rejected": -297.9188232421875, "loss": 0.3556, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.08432115614414215, "rewards/margins": 1.5375818014144897, "rewards/rejected": -1.6219028234481812, "step": 2500 }, { "epoch": 2.58, "eval_logits/chosen": 23.198068618774414, "eval_logits/rejected": 23.05204963684082, "eval_logps/chosen": -359.8403625488281, "eval_logps/rejected": -290.6595458984375, "eval_loss": 0.5010030269622803, "eval_rewards/accuracies": 0.7817460298538208, "eval_rewards/chosen": -0.5270243287086487, "eval_rewards/margins": 1.130587100982666, "eval_rewards/rejected": -1.6576114892959595, "eval_runtime": 208.0836, "eval_samples_per_second": 9.612, "eval_steps_per_second": 0.303, "step": 2500 }, { "epoch": 2.59, "learning_rate": 7.539226942212017e-08, "logits/chosen": 22.56097412109375, "logits/rejected": 22.520360946655273, "logps/chosen": -299.781982421875, "logps/rejected": -319.82171630859375, "loss": 0.3419, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.4395579397678375, "rewards/margins": 1.4742848873138428, "rewards/rejected": -1.9138429164886475, "step": 2510 }, { "epoch": 2.6, "learning_rate": 7.347876004592423e-08, "logits/chosen": 23.056285858154297, "logits/rejected": 22.88377571105957, "logps/chosen": -274.74249267578125, "logps/rejected": -217.62075805664062, "loss": 0.3617, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.48249778151512146, "rewards/margins": 1.2477834224700928, "rewards/rejected": -1.730281114578247, "step": 2520 }, { "epoch": 2.61, "learning_rate": 7.156525066972828e-08, "logits/chosen": 23.401355743408203, "logits/rejected": 23.3753604888916, "logps/chosen": -305.857666015625, "logps/rejected": -269.84344482421875, "loss": 0.3566, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.5294305086135864, "rewards/margins": 1.2906124591827393, "rewards/rejected": -1.8200428485870361, "step": 2530 }, { "epoch": 2.62, "learning_rate": 6.965174129353234e-08, "logits/chosen": 23.210468292236328, "logits/rejected": 23.068767547607422, "logps/chosen": -418.8457946777344, "logps/rejected": -322.9905700683594, "loss": 0.3506, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.12323112785816193, "rewards/margins": 1.8070284128189087, "rewards/rejected": -1.9302597045898438, "step": 2540 }, { "epoch": 2.63, "learning_rate": 6.773823191733639e-08, "logits/chosen": 23.159671783447266, "logits/rejected": 23.031639099121094, "logps/chosen": -341.6581726074219, "logps/rejected": -329.7276611328125, "loss": 0.3489, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.43813997507095337, "rewards/margins": 1.6730334758758545, "rewards/rejected": -2.111173152923584, "step": 2550 }, { "epoch": 2.64, "learning_rate": 6.582472254114045e-08, "logits/chosen": 22.655208587646484, "logits/rejected": 22.458200454711914, "logps/chosen": -298.61737060546875, "logps/rejected": -273.9313659667969, "loss": 0.3689, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.464535653591156, "rewards/margins": 1.3690606355667114, "rewards/rejected": -1.8335964679718018, "step": 2560 }, { "epoch": 2.65, "learning_rate": 6.391121316494451e-08, "logits/chosen": 23.089435577392578, "logits/rejected": 23.145009994506836, "logps/chosen": -347.81793212890625, "logps/rejected": -289.2439880371094, "loss": 0.3245, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.26666170358657837, "rewards/margins": 1.7196756601333618, "rewards/rejected": -1.9863373041152954, "step": 2570 }, { "epoch": 2.66, "learning_rate": 6.199770378874856e-08, "logits/chosen": 23.03936767578125, "logits/rejected": 22.83783531188965, "logps/chosen": -326.26123046875, "logps/rejected": -285.95294189453125, "loss": 0.3369, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.3959008455276489, "rewards/margins": 1.5528860092163086, "rewards/rejected": -1.948786735534668, "step": 2580 }, { "epoch": 2.67, "learning_rate": 6.008419441255262e-08, "logits/chosen": 22.73525047302246, "logits/rejected": 22.791269302368164, "logps/chosen": -291.1043395996094, "logps/rejected": -250.67880249023438, "loss": 0.3344, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.7139278650283813, "rewards/margins": 1.2345958948135376, "rewards/rejected": -1.948523759841919, "step": 2590 }, { "epoch": 2.68, "learning_rate": 5.817068503635668e-08, "logits/chosen": 23.139039993286133, "logits/rejected": 23.094404220581055, "logps/chosen": -375.664794921875, "logps/rejected": -297.1061706542969, "loss": 0.3277, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.29812633991241455, "rewards/margins": 1.5680840015411377, "rewards/rejected": -1.8662105798721313, "step": 2600 }, { "epoch": 2.68, "eval_logits/chosen": 23.19009780883789, "eval_logits/rejected": 23.044872283935547, "eval_logps/chosen": -359.9708251953125, "eval_logps/rejected": -290.89959716796875, "eval_loss": 0.49901142716407776, "eval_rewards/accuracies": 0.7777777910232544, "eval_rewards/chosen": -0.5400659441947937, "eval_rewards/margins": 1.141547679901123, "eval_rewards/rejected": -1.6816134452819824, "eval_runtime": 212.6416, "eval_samples_per_second": 9.405, "eval_steps_per_second": 0.296, "step": 2600 }, { "epoch": 2.69, "learning_rate": 5.6257175660160735e-08, "logits/chosen": 23.452491760253906, "logits/rejected": 23.291522979736328, "logps/chosen": -321.8898010253906, "logps/rejected": -302.38250732421875, "loss": 0.3198, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.5151566863059998, "rewards/margins": 1.5868747234344482, "rewards/rejected": -2.1020312309265137, "step": 2610 }, { "epoch": 2.71, "learning_rate": 5.4343666283964784e-08, "logits/chosen": 22.93613052368164, "logits/rejected": 23.02700424194336, "logps/chosen": -337.6763000488281, "logps/rejected": -263.98406982421875, "loss": 0.3544, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.38289493322372437, "rewards/margins": 1.6999647617340088, "rewards/rejected": -2.0828592777252197, "step": 2620 }, { "epoch": 2.72, "learning_rate": 5.243015690776884e-08, "logits/chosen": 23.027408599853516, "logits/rejected": 23.09657096862793, "logps/chosen": -300.5941162109375, "logps/rejected": -263.323486328125, "loss": 0.3481, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.6709809899330139, "rewards/margins": 1.2178490161895752, "rewards/rejected": -1.8888299465179443, "step": 2630 }, { "epoch": 2.73, "learning_rate": 5.05166475315729e-08, "logits/chosen": 23.28525161743164, "logits/rejected": 23.195045471191406, "logps/chosen": -272.472900390625, "logps/rejected": -262.8435974121094, "loss": 0.3379, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.8382778167724609, "rewards/margins": 1.3932462930679321, "rewards/rejected": -2.2315242290496826, "step": 2640 }, { "epoch": 2.74, "learning_rate": 4.860313815537696e-08, "logits/chosen": 22.973094940185547, "logits/rejected": 22.961816787719727, "logps/chosen": -367.30596923828125, "logps/rejected": -294.1488952636719, "loss": 0.3489, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.19501671195030212, "rewards/margins": 1.6359647512435913, "rewards/rejected": -1.8309814929962158, "step": 2650 }, { "epoch": 2.75, "learning_rate": 4.668962877918101e-08, "logits/chosen": 23.02678680419922, "logits/rejected": 22.784521102905273, "logps/chosen": -329.62030029296875, "logps/rejected": -373.6632080078125, "loss": 0.3306, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.4452723562717438, "rewards/margins": 1.5809751749038696, "rewards/rejected": -2.026247501373291, "step": 2660 }, { "epoch": 2.76, "learning_rate": 4.477611940298507e-08, "logits/chosen": 22.902379989624023, "logits/rejected": 22.912425994873047, "logps/chosen": -332.11102294921875, "logps/rejected": -280.44976806640625, "loss": 0.3247, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.35678499937057495, "rewards/margins": 1.8047094345092773, "rewards/rejected": -2.161494493484497, "step": 2670 }, { "epoch": 2.77, "learning_rate": 4.2862610026789124e-08, "logits/chosen": 23.08858871459961, "logits/rejected": 22.95041275024414, "logps/chosen": -337.7413330078125, "logps/rejected": -293.63623046875, "loss": 0.3618, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.3582797646522522, "rewards/margins": 1.579685091972351, "rewards/rejected": -1.9379650354385376, "step": 2680 }, { "epoch": 2.78, "learning_rate": 4.0949100650593186e-08, "logits/chosen": 22.903911590576172, "logits/rejected": 22.945873260498047, "logps/chosen": -272.99493408203125, "logps/rejected": -277.4879455566406, "loss": 0.3657, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6198464035987854, "rewards/margins": 1.3183465003967285, "rewards/rejected": -1.9381929636001587, "step": 2690 }, { "epoch": 2.79, "learning_rate": 3.903559127439724e-08, "logits/chosen": 23.07727813720703, "logits/rejected": 22.927719116210938, "logps/chosen": -282.8468933105469, "logps/rejected": -237.0935821533203, "loss": 0.3262, "rewards/accuracies": 0.875, "rewards/chosen": -0.40676426887512207, "rewards/margins": 1.3733875751495361, "rewards/rejected": -1.7801517248153687, "step": 2700 }, { "epoch": 2.79, "eval_logits/chosen": 23.187774658203125, "eval_logits/rejected": 23.043867111206055, "eval_logps/chosen": -359.5220031738281, "eval_logps/rejected": -290.49322509765625, "eval_loss": 0.4993184804916382, "eval_rewards/accuracies": 0.7777777910232544, "eval_rewards/chosen": -0.4951845407485962, "eval_rewards/margins": 1.1457940340042114, "eval_rewards/rejected": -1.6409783363342285, "eval_runtime": 210.8376, "eval_samples_per_second": 9.486, "eval_steps_per_second": 0.299, "step": 2700 }, { "epoch": 2.8, "learning_rate": 3.71220818982013e-08, "logits/chosen": 23.060585021972656, "logits/rejected": 22.836994171142578, "logps/chosen": -348.59539794921875, "logps/rejected": -282.60064697265625, "loss": 0.3585, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.4623526632785797, "rewards/margins": 1.4493197202682495, "rewards/rejected": -1.911672592163086, "step": 2710 }, { "epoch": 2.81, "learning_rate": 3.520857252200535e-08, "logits/chosen": 23.332260131835938, "logits/rejected": 23.22934341430664, "logps/chosen": -373.9750061035156, "logps/rejected": -321.8055725097656, "loss": 0.334, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.24776187539100647, "rewards/margins": 1.6318897008895874, "rewards/rejected": -1.879651427268982, "step": 2720 }, { "epoch": 2.82, "learning_rate": 3.3295063145809414e-08, "logits/chosen": 23.10513687133789, "logits/rejected": 23.070053100585938, "logps/chosen": -295.7916259765625, "logps/rejected": -298.29132080078125, "loss": 0.3567, "rewards/accuracies": 0.875, "rewards/chosen": -0.4222725033760071, "rewards/margins": 1.4338531494140625, "rewards/rejected": -1.8561254739761353, "step": 2730 }, { "epoch": 2.83, "learning_rate": 3.138155376961347e-08, "logits/chosen": 22.993267059326172, "logits/rejected": 22.975433349609375, "logps/chosen": -340.28515625, "logps/rejected": -270.3987731933594, "loss": 0.3505, "rewards/accuracies": 0.8125, "rewards/chosen": -0.4524230360984802, "rewards/margins": 1.500270962715149, "rewards/rejected": -1.9526941776275635, "step": 2740 }, { "epoch": 2.84, "learning_rate": 2.9468044393417525e-08, "logits/chosen": 22.807130813598633, "logits/rejected": 22.657257080078125, "logps/chosen": -302.7679748535156, "logps/rejected": -253.1012420654297, "loss": 0.3457, "rewards/accuracies": 0.875, "rewards/chosen": -0.3587990403175354, "rewards/margins": 1.4861528873443604, "rewards/rejected": -1.8449519872665405, "step": 2750 }, { "epoch": 2.85, "learning_rate": 2.755453501722158e-08, "logits/chosen": 22.716732025146484, "logits/rejected": 22.806201934814453, "logps/chosen": -340.51287841796875, "logps/rejected": -296.96673583984375, "loss": 0.3386, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.4456048011779785, "rewards/margins": 1.4643114805221558, "rewards/rejected": -1.9099165201187134, "step": 2760 }, { "epoch": 2.86, "learning_rate": 2.564102564102564e-08, "logits/chosen": 23.153486251831055, "logits/rejected": 23.201038360595703, "logps/chosen": -308.52288818359375, "logps/rejected": -289.1993408203125, "loss": 0.3403, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.3693477213382721, "rewards/margins": 1.4097144603729248, "rewards/rejected": -1.779062032699585, "step": 2770 }, { "epoch": 2.87, "learning_rate": 2.3727516264829695e-08, "logits/chosen": 22.77389907836914, "logits/rejected": 22.64432144165039, "logps/chosen": -388.85552978515625, "logps/rejected": -363.8034362792969, "loss": 0.3601, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.32134681940078735, "rewards/margins": 1.4628154039382935, "rewards/rejected": -1.784161925315857, "step": 2780 }, { "epoch": 2.88, "learning_rate": 2.1814006888633754e-08, "logits/chosen": 22.792617797851562, "logits/rejected": 22.739648818969727, "logps/chosen": -340.0162353515625, "logps/rejected": -269.2567443847656, "loss": 0.3476, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.3537456691265106, "rewards/margins": 1.403322696685791, "rewards/rejected": -1.7570682764053345, "step": 2790 }, { "epoch": 2.89, "learning_rate": 1.990049751243781e-08, "logits/chosen": 23.177873611450195, "logits/rejected": 23.13758087158203, "logps/chosen": -343.5755615234375, "logps/rejected": -288.75555419921875, "loss": 0.3566, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.39105096459388733, "rewards/margins": 1.0802559852600098, "rewards/rejected": -1.4713070392608643, "step": 2800 }, { "epoch": 2.89, "eval_logits/chosen": 23.187063217163086, "eval_logits/rejected": 23.043275833129883, "eval_logps/chosen": -359.0445251464844, "eval_logps/rejected": -290.0010070800781, "eval_loss": 0.4985302686691284, "eval_rewards/accuracies": 0.7777777910232544, "eval_rewards/chosen": -0.44743794202804565, "eval_rewards/margins": 1.144317388534546, "eval_rewards/rejected": -1.5917555093765259, "eval_runtime": 208.7121, "eval_samples_per_second": 9.583, "eval_steps_per_second": 0.302, "step": 2800 }, { "epoch": 2.9, "learning_rate": 1.7986988136241865e-08, "logits/chosen": 22.997955322265625, "logits/rejected": 23.055164337158203, "logps/chosen": -362.84918212890625, "logps/rejected": -298.51922607421875, "loss": 0.3433, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.2676233649253845, "rewards/margins": 1.3830516338348389, "rewards/rejected": -1.650674819946289, "step": 2810 }, { "epoch": 2.91, "learning_rate": 1.6073478760045924e-08, "logits/chosen": 23.13959312438965, "logits/rejected": 22.91689682006836, "logps/chosen": -358.01666259765625, "logps/rejected": -246.228515625, "loss": 0.3319, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.24576838314533234, "rewards/margins": 1.788368582725525, "rewards/rejected": -2.03413724899292, "step": 2820 }, { "epoch": 2.92, "learning_rate": 1.4159969383849981e-08, "logits/chosen": 22.98459243774414, "logits/rejected": 23.025390625, "logps/chosen": -346.28973388671875, "logps/rejected": -279.1742858886719, "loss": 0.3331, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.40038982033729553, "rewards/margins": 1.4257802963256836, "rewards/rejected": -1.8261702060699463, "step": 2830 }, { "epoch": 2.93, "learning_rate": 1.2246460007654037e-08, "logits/chosen": 23.19771957397461, "logits/rejected": 23.1368408203125, "logps/chosen": -349.0854187011719, "logps/rejected": -281.1717529296875, "loss": 0.3685, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.481764018535614, "rewards/margins": 1.4477870464324951, "rewards/rejected": -1.929551124572754, "step": 2840 }, { "epoch": 2.94, "learning_rate": 1.0332950631458094e-08, "logits/chosen": 23.161651611328125, "logits/rejected": 23.000064849853516, "logps/chosen": -338.02288818359375, "logps/rejected": -283.4983215332031, "loss": 0.3501, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.45347729325294495, "rewards/margins": 1.2738498449325562, "rewards/rejected": -1.7273271083831787, "step": 2850 }, { "epoch": 2.95, "learning_rate": 8.419441255262151e-09, "logits/chosen": 22.9562931060791, "logits/rejected": 22.93158531188965, "logps/chosen": -301.29132080078125, "logps/rejected": -239.3927001953125, "loss": 0.3382, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.4900715947151184, "rewards/margins": 1.371382713317871, "rewards/rejected": -1.8614543676376343, "step": 2860 }, { "epoch": 2.96, "learning_rate": 6.505931879066207e-09, "logits/chosen": 22.94473648071289, "logits/rejected": 23.008617401123047, "logps/chosen": -302.16436767578125, "logps/rejected": -269.48828125, "loss": 0.3549, "rewards/accuracies": 0.8125, "rewards/chosen": -0.308788925409317, "rewards/margins": 1.3573650121688843, "rewards/rejected": -1.666154146194458, "step": 2870 }, { "epoch": 2.97, "learning_rate": 4.592422502870264e-09, "logits/chosen": 23.0222225189209, "logits/rejected": 22.979480743408203, "logps/chosen": -329.9389343261719, "logps/rejected": -272.41351318359375, "loss": 0.3559, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.35177531838417053, "rewards/margins": 1.4292513132095337, "rewards/rejected": -1.781026840209961, "step": 2880 }, { "epoch": 2.98, "learning_rate": 2.6789131266743202e-09, "logits/chosen": 23.136159896850586, "logits/rejected": 23.02133560180664, "logps/chosen": -328.77093505859375, "logps/rejected": -275.63995361328125, "loss": 0.3498, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.4196252226829529, "rewards/margins": 1.4171664714813232, "rewards/rejected": -1.8367916345596313, "step": 2890 }, { "epoch": 2.99, "learning_rate": 7.654037504783773e-10, "logits/chosen": 23.257701873779297, "logits/rejected": 23.05466079711914, "logps/chosen": -311.91217041015625, "logps/rejected": -304.32501220703125, "loss": 0.3386, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.36090317368507385, "rewards/margins": 1.4512748718261719, "rewards/rejected": -1.8121780157089233, "step": 2900 }, { "epoch": 2.99, "eval_logits/chosen": 23.18655014038086, "eval_logits/rejected": 23.042728424072266, "eval_logps/chosen": -359.16790771484375, "eval_logps/rejected": -290.12347412109375, "eval_loss": 0.4982847273349762, "eval_rewards/accuracies": 0.7817460298538208, "eval_rewards/chosen": -0.4597766697406769, "eval_rewards/margins": 1.144227385520935, "eval_rewards/rejected": -1.6040042638778687, "eval_runtime": 212.9399, "eval_samples_per_second": 9.392, "eval_steps_per_second": 0.296, "step": 2900 }, { "epoch": 3.0, "step": 2904, "total_flos": 0.0, "train_loss": 0.446941960284861, "train_runtime": 57869.3533, "train_samples_per_second": 3.212, "train_steps_per_second": 0.05 } ], "logging_steps": 10, "max_steps": 2904, "num_train_epochs": 3, "save_steps": 500, "total_flos": 0.0, "trial_name": null, "trial_params": null }