diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,16068 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.9997265418482124, + "eval_steps": 100, + "global_step": 10284, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 4.859086491739552e-10, + "logits/chosen": -2.7380757331848145, + "logits/rejected": -2.824676990509033, + "logps/chosen": -110.21537780761719, + "logps/rejected": -136.8306427001953, + "loss": 1.0, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 4.859086491739553e-09, + "logits/chosen": -2.7822985649108887, + "logits/rejected": -2.781153917312622, + "logps/chosen": -221.65379333496094, + "logps/rejected": -202.54539489746094, + "loss": 1.0001, + "rewards/accuracies": 0.3576388955116272, + "rewards/chosen": 0.0013104991521686316, + "rewards/margins": 0.0014001904055476189, + "rewards/rejected": -8.969102054834366e-05, + "step": 10 + }, + { + "epoch": 0.01, + "learning_rate": 9.718172983479106e-09, + "logits/chosen": -2.7663846015930176, + "logits/rejected": -2.757770299911499, + "logps/chosen": -199.00106811523438, + "logps/rejected": -185.31143188476562, + "loss": 1.0015, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.001889596926048398, + "rewards/margins": -0.00209548557177186, + "rewards/rejected": 0.0002058891550404951, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 1.4577259475218657e-08, + "logits/chosen": -2.8105146884918213, + "logits/rejected": -2.7981715202331543, + "logps/chosen": -208.05624389648438, + "logps/rejected": -192.64044189453125, + "loss": 1.0002, + "rewards/accuracies": 0.47187501192092896, + "rewards/chosen": -0.003374215215444565, + "rewards/margins": -0.0016945224488154054, + "rewards/rejected": -0.0016796926502138376, + "step": 30 + }, + { + "epoch": 0.01, + "learning_rate": 1.943634596695821e-08, + "logits/chosen": -2.7664506435394287, + "logits/rejected": -2.7862141132354736, + "logps/chosen": -197.5291290283203, + "logps/rejected": -188.8165740966797, + "loss": 1.002, + "rewards/accuracies": 0.4468750059604645, + "rewards/chosen": -0.0014407314592972398, + "rewards/margins": -0.00430614547803998, + "rewards/rejected": 0.0028654143679887056, + "step": 40 + }, + { + "epoch": 0.01, + "learning_rate": 2.4295432458697764e-08, + "logits/chosen": -2.776463270187378, + "logits/rejected": -2.7792251110076904, + "logps/chosen": -201.38197326660156, + "logps/rejected": -189.93276977539062, + "loss": 0.999, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": 0.0018952597165480256, + "rewards/margins": 0.00021468903287313879, + "rewards/rejected": 0.0016805704217404127, + "step": 50 + }, + { + "epoch": 0.02, + "learning_rate": 2.9154518950437314e-08, + "logits/chosen": -2.7934350967407227, + "logits/rejected": -2.7988104820251465, + "logps/chosen": -206.02767944335938, + "logps/rejected": -189.4266357421875, + "loss": 1.0014, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -0.0006429950590245426, + "rewards/margins": -0.0019359359284862876, + "rewards/rejected": 0.0012929405784234405, + "step": 60 + }, + { + "epoch": 0.02, + "learning_rate": 3.4013605442176873e-08, + "logits/chosen": -2.78006649017334, + "logits/rejected": -2.7711257934570312, + "logps/chosen": -200.0697784423828, + "logps/rejected": -180.75758361816406, + "loss": 0.9966, + "rewards/accuracies": 0.5531250238418579, + "rewards/chosen": -9.763417619979009e-05, + "rewards/margins": 0.0023461231030523777, + "rewards/rejected": -0.002443757839500904, + "step": 70 + }, + { + "epoch": 0.02, + "learning_rate": 3.887269193391642e-08, + "logits/chosen": -2.7850944995880127, + "logits/rejected": -2.7837047576904297, + "logps/chosen": -192.77597045898438, + "logps/rejected": -182.43698120117188, + "loss": 0.9967, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.003315262496471405, + "rewards/margins": 0.005001432728022337, + "rewards/rejected": -0.0016861699987202883, + "step": 80 + }, + { + "epoch": 0.03, + "learning_rate": 4.373177842565597e-08, + "logits/chosen": -2.7849223613739014, + "logits/rejected": -2.765354633331299, + "logps/chosen": -200.2301788330078, + "logps/rejected": -183.25489807128906, + "loss": 0.9955, + "rewards/accuracies": 0.5718749761581421, + "rewards/chosen": 0.002481408417224884, + "rewards/margins": 0.0061470940709114075, + "rewards/rejected": -0.0036656856536865234, + "step": 90 + }, + { + "epoch": 0.03, + "learning_rate": 4.859086491739553e-08, + "logits/chosen": -2.7850663661956787, + "logits/rejected": -2.777864694595337, + "logps/chosen": -211.11416625976562, + "logps/rejected": -191.1077423095703, + "loss": 0.9958, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.001670264988206327, + "rewards/margins": 0.0035112425684928894, + "rewards/rejected": -0.0018409776967018843, + "step": 100 + }, + { + "epoch": 0.03, + "eval_logits/chosen": -2.6924479007720947, + "eval_logits/rejected": -2.6876208782196045, + "eval_logps/chosen": -195.60777282714844, + "eval_logps/rejected": -180.92320251464844, + "eval_loss": 1.0002543926239014, + "eval_rewards/accuracies": 0.49303194880485535, + "eval_rewards/chosen": -0.000245044706389308, + "eval_rewards/margins": -5.4176409321371466e-05, + "eval_rewards/rejected": -0.00019086812972091138, + "eval_runtime": 444.5306, + "eval_samples_per_second": 26.466, + "eval_steps_per_second": 3.309, + "step": 100 + }, + { + "epoch": 0.03, + "learning_rate": 5.344995140913508e-08, + "logits/chosen": -2.7685182094573975, + "logits/rejected": -2.7815494537353516, + "logps/chosen": -203.09451293945312, + "logps/rejected": -192.09835815429688, + "loss": 1.0022, + "rewards/accuracies": 0.45625001192092896, + "rewards/chosen": -0.0015572088304907084, + "rewards/margins": -0.0013193445047363639, + "rewards/rejected": -0.00023786406381987035, + "step": 110 + }, + { + "epoch": 0.04, + "learning_rate": 5.830903790087463e-08, + "logits/chosen": -2.7377212047576904, + "logits/rejected": -2.7259089946746826, + "logps/chosen": -197.40304565429688, + "logps/rejected": -177.4573211669922, + "loss": 1.0007, + "rewards/accuracies": 0.5218750238418579, + "rewards/chosen": -0.0015263364184647799, + "rewards/margins": 0.0014055164065212011, + "rewards/rejected": -0.0029318523593246937, + "step": 120 + }, + { + "epoch": 0.04, + "learning_rate": 6.316812439261418e-08, + "logits/chosen": -2.781754970550537, + "logits/rejected": -2.780665397644043, + "logps/chosen": -206.015869140625, + "logps/rejected": -194.5734405517578, + "loss": 1.001, + "rewards/accuracies": 0.528124988079071, + "rewards/chosen": 0.0026487892027944326, + "rewards/margins": 0.001911659142933786, + "rewards/rejected": 0.000737129885237664, + "step": 130 + }, + { + "epoch": 0.04, + "learning_rate": 6.802721088435375e-08, + "logits/chosen": -2.7300639152526855, + "logits/rejected": -2.7538230419158936, + "logps/chosen": -193.50584411621094, + "logps/rejected": -173.8282012939453, + "loss": 1.0005, + "rewards/accuracies": 0.4906249940395355, + "rewards/chosen": -0.0014676448190584779, + "rewards/margins": -0.001340946415439248, + "rewards/rejected": -0.0001266980543732643, + "step": 140 + }, + { + "epoch": 0.04, + "learning_rate": 7.28862973760933e-08, + "logits/chosen": -2.780795097351074, + "logits/rejected": -2.7789599895477295, + "logps/chosen": -186.2657012939453, + "logps/rejected": -176.09500122070312, + "loss": 0.9997, + "rewards/accuracies": 0.46562498807907104, + "rewards/chosen": -0.006670904811471701, + "rewards/margins": -0.003007309976965189, + "rewards/rejected": -0.0036635962314903736, + "step": 150 + }, + { + "epoch": 0.05, + "learning_rate": 7.774538386783285e-08, + "logits/chosen": -2.7663471698760986, + "logits/rejected": -2.7770943641662598, + "logps/chosen": -170.55154418945312, + "logps/rejected": -173.17251586914062, + "loss": 1.0013, + "rewards/accuracies": 0.4781250059604645, + "rewards/chosen": -0.0022262814454734325, + "rewards/margins": -0.002861147280782461, + "rewards/rejected": 0.0006348658935166895, + "step": 160 + }, + { + "epoch": 0.05, + "learning_rate": 8.26044703595724e-08, + "logits/chosen": -2.7730824947357178, + "logits/rejected": -2.769960403442383, + "logps/chosen": -221.2248077392578, + "logps/rejected": -201.74874877929688, + "loss": 0.9999, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.002443634672090411, + "rewards/margins": 0.0040279231034219265, + "rewards/rejected": -0.0015842880820855498, + "step": 170 + }, + { + "epoch": 0.05, + "learning_rate": 8.746355685131194e-08, + "logits/chosen": -2.76259183883667, + "logits/rejected": -2.7266428470611572, + "logps/chosen": -207.81698608398438, + "logps/rejected": -173.13430786132812, + "loss": 1.0002, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": 0.002216937020421028, + "rewards/margins": 0.001683462760411203, + "rewards/rejected": 0.0005334746092557907, + "step": 180 + }, + { + "epoch": 0.06, + "learning_rate": 9.23226433430515e-08, + "logits/chosen": -2.7751212120056152, + "logits/rejected": -2.7650701999664307, + "logps/chosen": -186.60348510742188, + "logps/rejected": -167.40306091308594, + "loss": 0.9997, + "rewards/accuracies": 0.49687498807907104, + "rewards/chosen": 0.000979422009550035, + "rewards/margins": -0.0013022356433793902, + "rewards/rejected": 0.0022816576529294252, + "step": 190 + }, + { + "epoch": 0.06, + "learning_rate": 9.718172983479106e-08, + "logits/chosen": -2.757312297821045, + "logits/rejected": -2.7516403198242188, + "logps/chosen": -202.6693572998047, + "logps/rejected": -188.0704345703125, + "loss": 0.9984, + "rewards/accuracies": 0.5218750238418579, + "rewards/chosen": 0.0023301562760025263, + "rewards/margins": 0.0031469545792788267, + "rewards/rejected": -0.0008167977211996913, + "step": 200 + }, + { + "epoch": 0.06, + "eval_logits/chosen": -2.6837515830993652, + "eval_logits/rejected": -2.6787424087524414, + "eval_logps/chosen": -195.6126708984375, + "eval_logps/rejected": -180.9346923828125, + "eval_loss": 0.9995196461677551, + "eval_rewards/accuracies": 0.4988103210926056, + "eval_rewards/chosen": -0.0007361570023931563, + "eval_rewards/margins": 0.0006052808603271842, + "eval_rewards/rejected": -0.0013414378045126796, + "eval_runtime": 443.7511, + "eval_samples_per_second": 26.513, + "eval_steps_per_second": 3.315, + "step": 200 + }, + { + "epoch": 0.06, + "learning_rate": 1.0204081632653061e-07, + "logits/chosen": -2.759521007537842, + "logits/rejected": -2.762321949005127, + "logps/chosen": -192.76931762695312, + "logps/rejected": -187.10482788085938, + "loss": 0.999, + "rewards/accuracies": 0.4468750059604645, + "rewards/chosen": -0.0026286139618605375, + "rewards/margins": -0.003410499542951584, + "rewards/rejected": 0.0007818859303370118, + "step": 210 + }, + { + "epoch": 0.06, + "learning_rate": 1.0689990281827016e-07, + "logits/chosen": -2.746939182281494, + "logits/rejected": -2.776545763015747, + "logps/chosen": -178.4153289794922, + "logps/rejected": -182.76333618164062, + "loss": 1.0002, + "rewards/accuracies": 0.49687498807907104, + "rewards/chosen": -0.001649575075134635, + "rewards/margins": 0.0006654064636677504, + "rewards/rejected": -0.0023149813059717417, + "step": 220 + }, + { + "epoch": 0.07, + "learning_rate": 1.117589893100097e-07, + "logits/chosen": -2.767946720123291, + "logits/rejected": -2.777435779571533, + "logps/chosen": -185.25881958007812, + "logps/rejected": -184.54998779296875, + "loss": 1.0031, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.0033680996857583523, + "rewards/margins": -0.002976976800709963, + "rewards/rejected": -0.00039112247759476304, + "step": 230 + }, + { + "epoch": 0.07, + "learning_rate": 1.1661807580174926e-07, + "logits/chosen": -2.761780023574829, + "logits/rejected": -2.804439067840576, + "logps/chosen": -192.47836303710938, + "logps/rejected": -198.77731323242188, + "loss": 0.997, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -0.0017596991965547204, + "rewards/margins": -0.0007232691277749836, + "rewards/rejected": -0.0010364304762333632, + "step": 240 + }, + { + "epoch": 0.07, + "learning_rate": 1.2147716229348882e-07, + "logits/chosen": -2.7784340381622314, + "logits/rejected": -2.77835750579834, + "logps/chosen": -192.2952423095703, + "logps/rejected": -178.8677520751953, + "loss": 0.9982, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -0.0018869973719120026, + "rewards/margins": 0.001274367212317884, + "rewards/rejected": -0.003161365631967783, + "step": 250 + }, + { + "epoch": 0.08, + "learning_rate": 1.2633624878522837e-07, + "logits/chosen": -2.7689738273620605, + "logits/rejected": -2.777843952178955, + "logps/chosen": -207.0860595703125, + "logps/rejected": -197.57162475585938, + "loss": 1.0012, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": 0.00039492687210440636, + "rewards/margins": 0.0005041843978688121, + "rewards/rejected": -0.00010925752576440573, + "step": 260 + }, + { + "epoch": 0.08, + "learning_rate": 1.3119533527696792e-07, + "logits/chosen": -2.788908004760742, + "logits/rejected": -2.7591609954833984, + "logps/chosen": -226.4412841796875, + "logps/rejected": -188.81068420410156, + "loss": 0.9992, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -0.002002813620492816, + "rewards/margins": 0.0019938002806156874, + "rewards/rejected": -0.003996613435447216, + "step": 270 + }, + { + "epoch": 0.08, + "learning_rate": 1.360544217687075e-07, + "logits/chosen": -2.7908740043640137, + "logits/rejected": -2.779235601425171, + "logps/chosen": -202.7140655517578, + "logps/rejected": -184.89212036132812, + "loss": 0.9963, + "rewards/accuracies": 0.5093749761581421, + "rewards/chosen": -0.0004921169602312148, + "rewards/margins": 0.0021473595406860113, + "rewards/rejected": -0.002639476442709565, + "step": 280 + }, + { + "epoch": 0.08, + "learning_rate": 1.4091350826044704e-07, + "logits/chosen": -2.7945303916931152, + "logits/rejected": -2.7873740196228027, + "logps/chosen": -206.4544219970703, + "logps/rejected": -197.7809295654297, + "loss": 0.9997, + "rewards/accuracies": 0.515625, + "rewards/chosen": -0.00042502378346398473, + "rewards/margins": -0.0009619802003726363, + "rewards/rejected": 0.0005369562422856688, + "step": 290 + }, + { + "epoch": 0.09, + "learning_rate": 1.457725947521866e-07, + "logits/chosen": -2.7498443126678467, + "logits/rejected": -2.750371217727661, + "logps/chosen": -192.40003967285156, + "logps/rejected": -173.42335510253906, + "loss": 0.9982, + "rewards/accuracies": 0.503125011920929, + "rewards/chosen": 0.0006312422920018435, + "rewards/margins": -3.3728498237906024e-05, + "rewards/rejected": 0.0006649707793258131, + "step": 300 + }, + { + "epoch": 0.09, + "eval_logits/chosen": -2.689706802368164, + "eval_logits/rejected": -2.6848087310791016, + "eval_logps/chosen": -195.61361694335938, + "eval_logps/rejected": -180.93612670898438, + "eval_loss": 0.9996600151062012, + "eval_rewards/accuracies": 0.4983004629611969, + "eval_rewards/chosen": -0.0008316952735185623, + "eval_rewards/margins": 0.000652860093396157, + "eval_rewards/rejected": -0.001484555541537702, + "eval_runtime": 443.5992, + "eval_samples_per_second": 26.522, + "eval_steps_per_second": 3.316, + "step": 300 + }, + { + "epoch": 0.09, + "learning_rate": 1.5063168124392614e-07, + "logits/chosen": -2.750749111175537, + "logits/rejected": -2.7541632652282715, + "logps/chosen": -194.2585906982422, + "logps/rejected": -173.24342346191406, + "loss": 1.0014, + "rewards/accuracies": 0.484375, + "rewards/chosen": -0.004035498481243849, + "rewards/margins": -0.003319731680676341, + "rewards/rejected": -0.0007157664513215423, + "step": 310 + }, + { + "epoch": 0.09, + "learning_rate": 1.554907677356657e-07, + "logits/chosen": -2.7510886192321777, + "logits/rejected": -2.7908897399902344, + "logps/chosen": -163.4540557861328, + "logps/rejected": -183.45425415039062, + "loss": 0.9994, + "rewards/accuracies": 0.49687498807907104, + "rewards/chosen": 0.0019172386964783072, + "rewards/margins": 0.0012772420886904001, + "rewards/rejected": 0.0006399970734491944, + "step": 320 + }, + { + "epoch": 0.1, + "learning_rate": 1.6034985422740524e-07, + "logits/chosen": -2.730457067489624, + "logits/rejected": -2.754711627960205, + "logps/chosen": -173.8527069091797, + "logps/rejected": -189.8472900390625, + "loss": 0.9978, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -0.0029088188894093037, + "rewards/margins": 0.0033104638569056988, + "rewards/rejected": -0.006219283677637577, + "step": 330 + }, + { + "epoch": 0.1, + "learning_rate": 1.652089407191448e-07, + "logits/chosen": -2.76922607421875, + "logits/rejected": -2.7422194480895996, + "logps/chosen": -194.23680114746094, + "logps/rejected": -160.4568328857422, + "loss": 1.0014, + "rewards/accuracies": 0.49687498807907104, + "rewards/chosen": -0.0046670883893966675, + "rewards/margins": -0.00291821570135653, + "rewards/rejected": -0.0017488717567175627, + "step": 340 + }, + { + "epoch": 0.1, + "learning_rate": 1.7006802721088434e-07, + "logits/chosen": -2.771911382675171, + "logits/rejected": -2.798151969909668, + "logps/chosen": -181.0185546875, + "logps/rejected": -178.7841339111328, + "loss": 1.0032, + "rewards/accuracies": 0.528124988079071, + "rewards/chosen": -0.0018037393456324935, + "rewards/margins": -0.001680979155935347, + "rewards/rejected": -0.00012276046618353575, + "step": 350 + }, + { + "epoch": 0.11, + "learning_rate": 1.749271137026239e-07, + "logits/chosen": -2.812619209289551, + "logits/rejected": -2.79461669921875, + "logps/chosen": -223.05697631835938, + "logps/rejected": -200.67198181152344, + "loss": 1.0019, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -0.004876718390733004, + "rewards/margins": -0.0017235095147043467, + "rewards/rejected": -0.0031532091088593006, + "step": 360 + }, + { + "epoch": 0.11, + "learning_rate": 1.7978620019436344e-07, + "logits/chosen": -2.7639615535736084, + "logits/rejected": -2.761749029159546, + "logps/chosen": -191.48397827148438, + "logps/rejected": -174.09100341796875, + "loss": 0.9953, + "rewards/accuracies": 0.49687498807907104, + "rewards/chosen": -0.0004648994654417038, + "rewards/margins": 0.003035474568605423, + "rewards/rejected": -0.003500374499708414, + "step": 370 + }, + { + "epoch": 0.11, + "learning_rate": 1.84645286686103e-07, + "logits/chosen": -2.7765164375305176, + "logits/rejected": -2.7844157218933105, + "logps/chosen": -183.25009155273438, + "logps/rejected": -182.92747497558594, + "loss": 1.0008, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -0.0028205744456499815, + "rewards/margins": -0.003933761268854141, + "rewards/rejected": 0.0011131864739581943, + "step": 380 + }, + { + "epoch": 0.11, + "learning_rate": 1.8950437317784256e-07, + "logits/chosen": -2.7915711402893066, + "logits/rejected": -2.7761950492858887, + "logps/chosen": -198.72650146484375, + "logps/rejected": -186.8263702392578, + "loss": 0.9976, + "rewards/accuracies": 0.4906249940395355, + "rewards/chosen": -0.0008961642161011696, + "rewards/margins": 0.0019898409955203533, + "rewards/rejected": -0.002886005211621523, + "step": 390 + }, + { + "epoch": 0.12, + "learning_rate": 1.9436345966958211e-07, + "logits/chosen": -2.768798351287842, + "logits/rejected": -2.805954933166504, + "logps/chosen": -187.5673065185547, + "logps/rejected": -199.96156311035156, + "loss": 0.9966, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -0.003487077308818698, + "rewards/margins": 0.0017222666647285223, + "rewards/rejected": -0.0052093444392085075, + "step": 400 + }, + { + "epoch": 0.12, + "eval_logits/chosen": -2.6913726329803467, + "eval_logits/rejected": -2.6865181922912598, + "eval_logps/chosen": -195.6290740966797, + "eval_logps/rejected": -180.9485321044922, + "eval_loss": 0.9998844265937805, + "eval_rewards/accuracies": 0.4994901418685913, + "eval_rewards/chosen": -0.0023770283441990614, + "eval_rewards/margins": 0.0003477816062513739, + "eval_rewards/rejected": -0.0027248100377619267, + "eval_runtime": 443.4885, + "eval_samples_per_second": 26.528, + "eval_steps_per_second": 3.317, + "step": 400 + }, + { + "epoch": 0.12, + "learning_rate": 1.9922254616132166e-07, + "logits/chosen": -2.7575571537017822, + "logits/rejected": -2.7523703575134277, + "logps/chosen": -186.25596618652344, + "logps/rejected": -171.8108673095703, + "loss": 1.0021, + "rewards/accuracies": 0.4593749940395355, + "rewards/chosen": -0.0011599508579820395, + "rewards/margins": 0.0005078490357846022, + "rewards/rejected": -0.0016678001265972853, + "step": 410 + }, + { + "epoch": 0.12, + "learning_rate": 2.0408163265306121e-07, + "logits/chosen": -2.7704014778137207, + "logits/rejected": -2.745540142059326, + "logps/chosen": -188.27548217773438, + "logps/rejected": -167.63754272460938, + "loss": 0.9996, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.005081222392618656, + "rewards/margins": -0.0013192463666200638, + "rewards/rejected": -0.0037619750946760178, + "step": 420 + }, + { + "epoch": 0.13, + "learning_rate": 2.0894071914480076e-07, + "logits/chosen": -2.7747130393981934, + "logits/rejected": -2.773439407348633, + "logps/chosen": -201.019775390625, + "logps/rejected": -184.5673065185547, + "loss": 0.9988, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.004013075493276119, + "rewards/margins": 0.0002744763041846454, + "rewards/rejected": -0.004287551622837782, + "step": 430 + }, + { + "epoch": 0.13, + "learning_rate": 2.137998056365403e-07, + "logits/chosen": -2.776340961456299, + "logits/rejected": -2.7776546478271484, + "logps/chosen": -194.1025390625, + "logps/rejected": -183.6971893310547, + "loss": 0.9964, + "rewards/accuracies": 0.5093749761581421, + "rewards/chosen": -0.0036060716956853867, + "rewards/margins": -0.00018001366697717458, + "rewards/rejected": -0.0034260577522218227, + "step": 440 + }, + { + "epoch": 0.13, + "learning_rate": 2.1865889212827986e-07, + "logits/chosen": -2.745941638946533, + "logits/rejected": -2.7574284076690674, + "logps/chosen": -183.247802734375, + "logps/rejected": -176.6983184814453, + "loss": 0.9968, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -0.0049474723637104034, + "rewards/margins": -0.001293553039431572, + "rewards/rejected": -0.0036539186257869005, + "step": 450 + }, + { + "epoch": 0.13, + "learning_rate": 2.235179786200194e-07, + "logits/chosen": -2.769151210784912, + "logits/rejected": -2.7719428539276123, + "logps/chosen": -178.2666015625, + "logps/rejected": -168.8743438720703, + "loss": 0.9979, + "rewards/accuracies": 0.5218750238418579, + "rewards/chosen": -0.0005593408131971955, + "rewards/margins": 0.002134414855390787, + "rewards/rejected": -0.002693755552172661, + "step": 460 + }, + { + "epoch": 0.14, + "learning_rate": 2.2837706511175896e-07, + "logits/chosen": -2.783510684967041, + "logits/rejected": -2.7758355140686035, + "logps/chosen": -219.9918975830078, + "logps/rejected": -187.12355041503906, + "loss": 1.0, + "rewards/accuracies": 0.484375, + "rewards/chosen": -0.0005246883956715465, + "rewards/margins": 0.002583832014352083, + "rewards/rejected": -0.0031085205264389515, + "step": 470 + }, + { + "epoch": 0.14, + "learning_rate": 2.332361516034985e-07, + "logits/chosen": -2.7807018756866455, + "logits/rejected": -2.769526481628418, + "logps/chosen": -191.76307678222656, + "logps/rejected": -172.55288696289062, + "loss": 0.9995, + "rewards/accuracies": 0.4593749940395355, + "rewards/chosen": -0.0057826414704322815, + "rewards/margins": -0.003593266010284424, + "rewards/rejected": -0.002189375925809145, + "step": 480 + }, + { + "epoch": 0.14, + "learning_rate": 2.3809523809523806e-07, + "logits/chosen": -2.7531919479370117, + "logits/rejected": -2.7485499382019043, + "logps/chosen": -211.37136840820312, + "logps/rejected": -183.17626953125, + "loss": 0.9984, + "rewards/accuracies": 0.5406249761581421, + "rewards/chosen": -0.002168575767427683, + "rewards/margins": 0.0029125846922397614, + "rewards/rejected": -0.005081160459667444, + "step": 490 + }, + { + "epoch": 0.15, + "learning_rate": 2.4295432458697764e-07, + "logits/chosen": -2.8140952587127686, + "logits/rejected": -2.7944469451904297, + "logps/chosen": -215.98110961914062, + "logps/rejected": -196.25465393066406, + "loss": 0.9992, + "rewards/accuracies": 0.534375011920929, + "rewards/chosen": -0.00415486516430974, + "rewards/margins": -8.980366692412645e-05, + "rewards/rejected": -0.004065061453729868, + "step": 500 + }, + { + "epoch": 0.15, + "eval_logits/chosen": -2.6694087982177734, + "eval_logits/rejected": -2.6641385555267334, + "eval_logps/chosen": -195.6439971923828, + "eval_logps/rejected": -180.9752960205078, + "eval_loss": 0.9983938932418823, + "eval_rewards/accuracies": 0.5122365951538086, + "eval_rewards/chosen": -0.003868211293593049, + "eval_rewards/margins": 0.0015316897770389915, + "eval_rewards/rejected": -0.005399900488555431, + "eval_runtime": 443.5731, + "eval_samples_per_second": 26.523, + "eval_steps_per_second": 3.316, + "step": 500 + }, + { + "epoch": 0.15, + "learning_rate": 2.478134110787172e-07, + "logits/chosen": -2.7879891395568848, + "logits/rejected": -2.7984836101531982, + "logps/chosen": -203.31301879882812, + "logps/rejected": -215.4292755126953, + "loss": 0.9961, + "rewards/accuracies": 0.559374988079071, + "rewards/chosen": -0.003516793716698885, + "rewards/margins": 0.005363177973777056, + "rewards/rejected": -0.00887997169047594, + "step": 510 + }, + { + "epoch": 0.15, + "learning_rate": 2.5267249757045674e-07, + "logits/chosen": -2.7599759101867676, + "logits/rejected": -2.75394868850708, + "logps/chosen": -167.49600219726562, + "logps/rejected": -158.7400665283203, + "loss": 1.0014, + "rewards/accuracies": 0.49687498807907104, + "rewards/chosen": -0.0029489509761333466, + "rewards/margins": 0.004199314396828413, + "rewards/rejected": -0.007148265838623047, + "step": 520 + }, + { + "epoch": 0.15, + "learning_rate": 2.575315840621963e-07, + "logits/chosen": -2.7890686988830566, + "logits/rejected": -2.7895076274871826, + "logps/chosen": -210.8441162109375, + "logps/rejected": -190.7526397705078, + "loss": 1.0027, + "rewards/accuracies": 0.41874998807907104, + "rewards/chosen": -0.008806548081338406, + "rewards/margins": -0.006835663225501776, + "rewards/rejected": -0.0019708843901753426, + "step": 530 + }, + { + "epoch": 0.16, + "learning_rate": 2.6239067055393583e-07, + "logits/chosen": -2.7971768379211426, + "logits/rejected": -2.7915546894073486, + "logps/chosen": -206.54354858398438, + "logps/rejected": -194.00369262695312, + "loss": 0.9982, + "rewards/accuracies": 0.4906249940395355, + "rewards/chosen": -0.005383323412388563, + "rewards/margins": 9.738374501466751e-06, + "rewards/rejected": -0.005393061321228743, + "step": 540 + }, + { + "epoch": 0.16, + "learning_rate": 2.6724975704567544e-07, + "logits/chosen": -2.7736599445343018, + "logits/rejected": -2.764831781387329, + "logps/chosen": -214.5034942626953, + "logps/rejected": -188.35311889648438, + "loss": 0.9938, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.003384556155651808, + "rewards/margins": 0.004990326706320047, + "rewards/rejected": -0.008374882861971855, + "step": 550 + }, + { + "epoch": 0.16, + "learning_rate": 2.72108843537415e-07, + "logits/chosen": -2.7674479484558105, + "logits/rejected": -2.767453670501709, + "logps/chosen": -185.07174682617188, + "logps/rejected": -179.09393310546875, + "loss": 0.9974, + "rewards/accuracies": 0.4906249940395355, + "rewards/chosen": -0.005933644715696573, + "rewards/margins": 0.0022817221470177174, + "rewards/rejected": -0.00821536686271429, + "step": 560 + }, + { + "epoch": 0.17, + "learning_rate": 2.7696793002915454e-07, + "logits/chosen": -2.7519259452819824, + "logits/rejected": -2.7778701782226562, + "logps/chosen": -177.8760986328125, + "logps/rejected": -182.38775634765625, + "loss": 0.9915, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.005426911171525717, + "rewards/margins": 0.010609583929181099, + "rewards/rejected": -0.016036493703722954, + "step": 570 + }, + { + "epoch": 0.17, + "learning_rate": 2.818270165208941e-07, + "logits/chosen": -2.7523977756500244, + "logits/rejected": -2.7401721477508545, + "logps/chosen": -192.66586303710938, + "logps/rejected": -169.11270141601562, + "loss": 0.9977, + "rewards/accuracies": 0.503125011920929, + "rewards/chosen": -0.0048529463820159435, + "rewards/margins": 0.004674314521253109, + "rewards/rejected": -0.00952726136893034, + "step": 580 + }, + { + "epoch": 0.17, + "learning_rate": 2.8668610301263364e-07, + "logits/chosen": -2.7547593116760254, + "logits/rejected": -2.7581233978271484, + "logps/chosen": -192.94606018066406, + "logps/rejected": -190.77268981933594, + "loss": 0.9969, + "rewards/accuracies": 0.49687498807907104, + "rewards/chosen": -0.0011147389886900783, + "rewards/margins": 0.004964867141097784, + "rewards/rejected": -0.006079606246203184, + "step": 590 + }, + { + "epoch": 0.18, + "learning_rate": 2.915451895043732e-07, + "logits/chosen": -2.7692792415618896, + "logits/rejected": -2.765631914138794, + "logps/chosen": -190.91696166992188, + "logps/rejected": -173.8136444091797, + "loss": 0.9983, + "rewards/accuracies": 0.484375, + "rewards/chosen": -0.009905226528644562, + "rewards/margins": 0.000821866444312036, + "rewards/rejected": -0.010727094486355782, + "step": 600 + }, + { + "epoch": 0.18, + "eval_logits/chosen": -2.6910696029663086, + "eval_logits/rejected": -2.686223030090332, + "eval_logps/chosen": -195.6588592529297, + "eval_logps/rejected": -180.99447631835938, + "eval_loss": 0.9981443285942078, + "eval_rewards/accuracies": 0.5127464532852173, + "eval_rewards/chosen": -0.005352581851184368, + "eval_rewards/margins": 0.0019660205580294132, + "eval_rewards/rejected": -0.007318601943552494, + "eval_runtime": 443.4182, + "eval_samples_per_second": 26.533, + "eval_steps_per_second": 3.317, + "step": 600 + }, + { + "epoch": 0.18, + "learning_rate": 2.9640427599611273e-07, + "logits/chosen": -2.7499747276306152, + "logits/rejected": -2.7771193981170654, + "logps/chosen": -170.37429809570312, + "logps/rejected": -174.40542602539062, + "loss": 0.9969, + "rewards/accuracies": 0.5093749761581421, + "rewards/chosen": -0.0075881704688072205, + "rewards/margins": -0.0003000420401804149, + "rewards/rejected": -0.007288129068911076, + "step": 610 + }, + { + "epoch": 0.18, + "learning_rate": 3.012633624878523e-07, + "logits/chosen": -2.7855448722839355, + "logits/rejected": -2.774034023284912, + "logps/chosen": -199.66506958007812, + "logps/rejected": -181.94955444335938, + "loss": 0.9963, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.00368059566244483, + "rewards/margins": 0.001826319145038724, + "rewards/rejected": -0.0055069150403141975, + "step": 620 + }, + { + "epoch": 0.18, + "learning_rate": 3.0612244897959183e-07, + "logits/chosen": -2.7524774074554443, + "logits/rejected": -2.7855701446533203, + "logps/chosen": -176.02008056640625, + "logps/rejected": -177.9148712158203, + "loss": 0.9963, + "rewards/accuracies": 0.5093749761581421, + "rewards/chosen": -0.004544141236692667, + "rewards/margins": 0.0032399215269833803, + "rewards/rejected": -0.007784062065184116, + "step": 630 + }, + { + "epoch": 0.19, + "learning_rate": 3.109815354713314e-07, + "logits/chosen": -2.7449309825897217, + "logits/rejected": -2.750732183456421, + "logps/chosen": -181.66734313964844, + "logps/rejected": -168.88841247558594, + "loss": 0.9945, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.0030544609762728214, + "rewards/margins": 0.00868634507060051, + "rewards/rejected": -0.011740805581212044, + "step": 640 + }, + { + "epoch": 0.19, + "learning_rate": 3.1584062196307093e-07, + "logits/chosen": -2.777808904647827, + "logits/rejected": -2.7611007690429688, + "logps/chosen": -202.5792999267578, + "logps/rejected": -188.46258544921875, + "loss": 0.9955, + "rewards/accuracies": 0.559374988079071, + "rewards/chosen": -0.005264073144644499, + "rewards/margins": 0.005189122632145882, + "rewards/rejected": -0.010453195311129093, + "step": 650 + }, + { + "epoch": 0.19, + "learning_rate": 3.206997084548105e-07, + "logits/chosen": -2.788951873779297, + "logits/rejected": -2.7755539417266846, + "logps/chosen": -198.77944946289062, + "logps/rejected": -177.9780731201172, + "loss": 0.9997, + "rewards/accuracies": 0.503125011920929, + "rewards/chosen": -0.00859787967056036, + "rewards/margins": 0.0023240833543241024, + "rewards/rejected": -0.01092196349054575, + "step": 660 + }, + { + "epoch": 0.2, + "learning_rate": 3.2555879494655003e-07, + "logits/chosen": -2.771275281906128, + "logits/rejected": -2.778935432434082, + "logps/chosen": -208.5783233642578, + "logps/rejected": -192.476318359375, + "loss": 0.998, + "rewards/accuracies": 0.49687498807907104, + "rewards/chosen": -0.009211702272295952, + "rewards/margins": 0.0008993959054350853, + "rewards/rejected": -0.010111097246408463, + "step": 670 + }, + { + "epoch": 0.2, + "learning_rate": 3.304178814382896e-07, + "logits/chosen": -2.7745091915130615, + "logits/rejected": -2.7752413749694824, + "logps/chosen": -186.6476593017578, + "logps/rejected": -172.84085083007812, + "loss": 0.9931, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.00590853113681078, + "rewards/margins": 0.007229860872030258, + "rewards/rejected": -0.013138392940163612, + "step": 680 + }, + { + "epoch": 0.2, + "learning_rate": 3.3527696793002913e-07, + "logits/chosen": -2.7487292289733887, + "logits/rejected": -2.753268241882324, + "logps/chosen": -194.5361785888672, + "logps/rejected": -178.95840454101562, + "loss": 0.9953, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.009363415651023388, + "rewards/margins": 0.007163605652749538, + "rewards/rejected": -0.016527021303772926, + "step": 690 + }, + { + "epoch": 0.2, + "learning_rate": 3.401360544217687e-07, + "logits/chosen": -2.735527515411377, + "logits/rejected": -2.7788593769073486, + "logps/chosen": -190.52822875976562, + "logps/rejected": -198.77719116210938, + "loss": 0.9968, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.006488684564828873, + "rewards/margins": 0.003683714661747217, + "rewards/rejected": -0.010172399692237377, + "step": 700 + }, + { + "epoch": 0.2, + "eval_logits/chosen": -2.6803455352783203, + "eval_logits/rejected": -2.67529559135437, + "eval_logps/chosen": -195.698486328125, + "eval_logps/rejected": -181.0484619140625, + "eval_loss": 0.997223436832428, + "eval_rewards/accuracies": 0.5241332650184631, + "eval_rewards/chosen": -0.009316603653132915, + "eval_rewards/margins": 0.003401679452508688, + "eval_rewards/rejected": -0.01271828357130289, + "eval_runtime": 443.593, + "eval_samples_per_second": 26.522, + "eval_steps_per_second": 3.316, + "step": 700 + }, + { + "epoch": 0.21, + "learning_rate": 3.4499514091350823e-07, + "logits/chosen": -2.774488687515259, + "logits/rejected": -2.774510145187378, + "logps/chosen": -203.57730102539062, + "logps/rejected": -191.86526489257812, + "loss": 0.9927, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.007745369337499142, + "rewards/margins": 0.007565264590084553, + "rewards/rejected": -0.015310634858906269, + "step": 710 + }, + { + "epoch": 0.21, + "learning_rate": 3.498542274052478e-07, + "logits/chosen": -2.7597362995147705, + "logits/rejected": -2.7930312156677246, + "logps/chosen": -195.6759033203125, + "logps/rejected": -205.12081909179688, + "loss": 0.9979, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.008192415349185467, + "rewards/margins": 0.007148954086005688, + "rewards/rejected": -0.015341369435191154, + "step": 720 + }, + { + "epoch": 0.21, + "learning_rate": 3.5471331389698733e-07, + "logits/chosen": -2.7899694442749023, + "logits/rejected": -2.7742934226989746, + "logps/chosen": -236.7986297607422, + "logps/rejected": -213.5732421875, + "loss": 0.9908, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.0075895837508141994, + "rewards/margins": 0.007327827624976635, + "rewards/rejected": -0.014917412772774696, + "step": 730 + }, + { + "epoch": 0.22, + "learning_rate": 3.595724003887269e-07, + "logits/chosen": -2.775095224380493, + "logits/rejected": -2.7730841636657715, + "logps/chosen": -196.58554077148438, + "logps/rejected": -191.58370971679688, + "loss": 0.9989, + "rewards/accuracies": 0.5093749761581421, + "rewards/chosen": -0.016460472717881203, + "rewards/margins": -0.00014977165847085416, + "rewards/rejected": -0.01631070114672184, + "step": 740 + }, + { + "epoch": 0.22, + "learning_rate": 3.6443148688046643e-07, + "logits/chosen": -2.747368335723877, + "logits/rejected": -2.7749929428100586, + "logps/chosen": -182.0715789794922, + "logps/rejected": -181.74368286132812, + "loss": 0.994, + "rewards/accuracies": 0.515625, + "rewards/chosen": -0.007714638952165842, + "rewards/margins": 0.008383364416658878, + "rewards/rejected": -0.016098003834486008, + "step": 750 + }, + { + "epoch": 0.22, + "learning_rate": 3.69290573372206e-07, + "logits/chosen": -2.7606258392333984, + "logits/rejected": -2.7410714626312256, + "logps/chosen": -195.3123016357422, + "logps/rejected": -167.84469604492188, + "loss": 1.0003, + "rewards/accuracies": 0.515625, + "rewards/chosen": -0.014703328721225262, + "rewards/margins": 0.0018827319145202637, + "rewards/rejected": -0.01658605970442295, + "step": 760 + }, + { + "epoch": 0.22, + "learning_rate": 3.741496598639456e-07, + "logits/chosen": -2.8003480434417725, + "logits/rejected": -2.801697015762329, + "logps/chosen": -200.1820068359375, + "logps/rejected": -193.44102478027344, + "loss": 0.9936, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -0.012818267568945885, + "rewards/margins": 0.005492014344781637, + "rewards/rejected": -0.01831028237938881, + "step": 770 + }, + { + "epoch": 0.23, + "learning_rate": 3.7900874635568513e-07, + "logits/chosen": -2.736542224884033, + "logits/rejected": -2.7550294399261475, + "logps/chosen": -184.42022705078125, + "logps/rejected": -178.26968383789062, + "loss": 0.9981, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -0.011115864850580692, + "rewards/margins": 0.005478517152369022, + "rewards/rejected": -0.016594382002949715, + "step": 780 + }, + { + "epoch": 0.23, + "learning_rate": 3.838678328474247e-07, + "logits/chosen": -2.7842249870300293, + "logits/rejected": -2.7937369346618652, + "logps/chosen": -192.46023559570312, + "logps/rejected": -186.9135284423828, + "loss": 0.9998, + "rewards/accuracies": 0.5218750238418579, + "rewards/chosen": -0.012151426635682583, + "rewards/margins": 0.0024247639812529087, + "rewards/rejected": -0.014576191082596779, + "step": 790 + }, + { + "epoch": 0.23, + "learning_rate": 3.8872691933916423e-07, + "logits/chosen": -2.7808454036712646, + "logits/rejected": -2.7431931495666504, + "logps/chosen": -218.29647827148438, + "logps/rejected": -184.58609008789062, + "loss": 0.9893, + "rewards/accuracies": 0.546875, + "rewards/chosen": -0.009516356512904167, + "rewards/margins": 0.007908456958830357, + "rewards/rejected": -0.0174248106777668, + "step": 800 + }, + { + "epoch": 0.23, + "eval_logits/chosen": -2.672806978225708, + "eval_logits/rejected": -2.667616844177246, + "eval_logps/chosen": -195.71884155273438, + "eval_logps/rejected": -181.0857696533203, + "eval_loss": 0.9950501322746277, + "eval_rewards/accuracies": 0.5248130559921265, + "eval_rewards/chosen": -0.011351874098181725, + "eval_rewards/margins": 0.005094607826322317, + "eval_rewards/rejected": -0.01644648239016533, + "eval_runtime": 443.5917, + "eval_samples_per_second": 26.522, + "eval_steps_per_second": 3.316, + "step": 800 + }, + { + "epoch": 0.24, + "learning_rate": 3.935860058309038e-07, + "logits/chosen": -2.7565319538116455, + "logits/rejected": -2.772705554962158, + "logps/chosen": -175.65234375, + "logps/rejected": -171.27127075195312, + "loss": 0.9918, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -0.010509233921766281, + "rewards/margins": 0.008544095791876316, + "rewards/rejected": -0.019053328782320023, + "step": 810 + }, + { + "epoch": 0.24, + "learning_rate": 3.9844509232264333e-07, + "logits/chosen": -2.7437987327575684, + "logits/rejected": -2.760740280151367, + "logps/chosen": -180.13516235351562, + "logps/rejected": -173.41995239257812, + "loss": 0.9922, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.012019792571663857, + "rewards/margins": 0.0075903395190835, + "rewards/rejected": -0.01961013302206993, + "step": 820 + }, + { + "epoch": 0.24, + "learning_rate": 4.033041788143829e-07, + "logits/chosen": -2.754300117492676, + "logits/rejected": -2.754885196685791, + "logps/chosen": -198.19288635253906, + "logps/rejected": -176.8773651123047, + "loss": 0.9977, + "rewards/accuracies": 0.484375, + "rewards/chosen": -0.01481813658028841, + "rewards/margins": 0.0010406129295006394, + "rewards/rejected": -0.01585875079035759, + "step": 830 + }, + { + "epoch": 0.25, + "learning_rate": 4.0816326530612243e-07, + "logits/chosen": -2.742445230484009, + "logits/rejected": -2.761491298675537, + "logps/chosen": -180.978271484375, + "logps/rejected": -187.919677734375, + "loss": 0.9908, + "rewards/accuracies": 0.559374988079071, + "rewards/chosen": -0.009284690022468567, + "rewards/margins": 0.009516666643321514, + "rewards/rejected": -0.018801355734467506, + "step": 840 + }, + { + "epoch": 0.25, + "learning_rate": 4.13022351797862e-07, + "logits/chosen": -2.7756142616271973, + "logits/rejected": -2.761234760284424, + "logps/chosen": -205.8830108642578, + "logps/rejected": -187.29832458496094, + "loss": 0.9929, + "rewards/accuracies": 0.5218750238418579, + "rewards/chosen": -0.015474356710910797, + "rewards/margins": 0.0059659467078745365, + "rewards/rejected": -0.021440301090478897, + "step": 850 + }, + { + "epoch": 0.25, + "learning_rate": 4.178814382896015e-07, + "logits/chosen": -2.7605714797973633, + "logits/rejected": -2.768223285675049, + "logps/chosen": -193.89329528808594, + "logps/rejected": -186.303955078125, + "loss": 0.9917, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.015928097069263458, + "rewards/margins": 0.00672167306765914, + "rewards/rejected": -0.022649768739938736, + "step": 860 + }, + { + "epoch": 0.25, + "learning_rate": 4.227405247813411e-07, + "logits/chosen": -2.753310441970825, + "logits/rejected": -2.7532548904418945, + "logps/chosen": -200.5699005126953, + "logps/rejected": -190.7694549560547, + "loss": 0.993, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.01844353973865509, + "rewards/margins": 0.0066141290590167046, + "rewards/rejected": -0.02505766786634922, + "step": 870 + }, + { + "epoch": 0.26, + "learning_rate": 4.275996112730806e-07, + "logits/chosen": -2.786644458770752, + "logits/rejected": -2.776003360748291, + "logps/chosen": -194.55020141601562, + "logps/rejected": -184.32540893554688, + "loss": 0.9896, + "rewards/accuracies": 0.5531250238418579, + "rewards/chosen": -0.013445606455206871, + "rewards/margins": 0.011986644938588142, + "rewards/rejected": -0.025432255119085312, + "step": 880 + }, + { + "epoch": 0.26, + "learning_rate": 4.324586977648202e-07, + "logits/chosen": -2.754173994064331, + "logits/rejected": -2.7404377460479736, + "logps/chosen": -206.9376983642578, + "logps/rejected": -178.90286254882812, + "loss": 0.9932, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.015037769451737404, + "rewards/margins": 0.006139338947832584, + "rewards/rejected": -0.021177105605602264, + "step": 890 + }, + { + "epoch": 0.26, + "learning_rate": 4.373177842565597e-07, + "logits/chosen": -2.8073904514312744, + "logits/rejected": -2.7982370853424072, + "logps/chosen": -201.30996704101562, + "logps/rejected": -180.21197509765625, + "loss": 0.988, + "rewards/accuracies": 0.546875, + "rewards/chosen": -0.005904694087803364, + "rewards/margins": 0.011741106398403645, + "rewards/rejected": -0.01764579862356186, + "step": 900 + }, + { + "epoch": 0.26, + "eval_logits/chosen": -2.6813623905181885, + "eval_logits/rejected": -2.6763410568237305, + "eval_logps/chosen": -195.7743682861328, + "eval_logps/rejected": -181.16632080078125, + "eval_loss": 0.9923923015594482, + "eval_rewards/accuracies": 0.5421481728553772, + "eval_rewards/chosen": -0.016905097290873528, + "eval_rewards/margins": 0.007597530260682106, + "eval_rewards/rejected": -0.024502631276845932, + "eval_runtime": 443.5463, + "eval_samples_per_second": 26.525, + "eval_steps_per_second": 3.316, + "step": 900 + }, + { + "epoch": 0.27, + "learning_rate": 4.421768707482993e-07, + "logits/chosen": -2.7671663761138916, + "logits/rejected": -2.755331516265869, + "logps/chosen": -209.4878692626953, + "logps/rejected": -188.00961303710938, + "loss": 0.9968, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -0.019240865483880043, + "rewards/margins": -0.0007847605156712234, + "rewards/rejected": -0.018456105142831802, + "step": 910 + }, + { + "epoch": 0.27, + "learning_rate": 4.470359572400388e-07, + "logits/chosen": -2.7682743072509766, + "logits/rejected": -2.7791781425476074, + "logps/chosen": -197.57215881347656, + "logps/rejected": -187.62045288085938, + "loss": 0.9943, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.021333549171686172, + "rewards/margins": 0.005782320164144039, + "rewards/rejected": -0.027115870267152786, + "step": 920 + }, + { + "epoch": 0.27, + "learning_rate": 4.5189504373177837e-07, + "logits/chosen": -2.752847194671631, + "logits/rejected": -2.76066255569458, + "logps/chosen": -159.60269165039062, + "logps/rejected": -163.90760803222656, + "loss": 0.9949, + "rewards/accuracies": 0.5093749761581421, + "rewards/chosen": -0.017810573801398277, + "rewards/margins": 0.006514446344226599, + "rewards/rejected": -0.024325022473931313, + "step": 930 + }, + { + "epoch": 0.27, + "learning_rate": 4.567541302235179e-07, + "logits/chosen": -2.7474169731140137, + "logits/rejected": -2.760066509246826, + "logps/chosen": -190.1791534423828, + "logps/rejected": -181.58018493652344, + "loss": 0.9964, + "rewards/accuracies": 0.5406249761581421, + "rewards/chosen": -0.020396294072270393, + "rewards/margins": 0.001476737903431058, + "rewards/rejected": -0.021873032674193382, + "step": 940 + }, + { + "epoch": 0.28, + "learning_rate": 4.6161321671525747e-07, + "logits/chosen": -2.768737316131592, + "logits/rejected": -2.7523348331451416, + "logps/chosen": -204.35702514648438, + "logps/rejected": -174.8519287109375, + "loss": 0.9941, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.019180381670594215, + "rewards/margins": 0.002315108897164464, + "rewards/rejected": -0.02149549126625061, + "step": 950 + }, + { + "epoch": 0.28, + "learning_rate": 4.66472303206997e-07, + "logits/chosen": -2.756478786468506, + "logits/rejected": -2.737335443496704, + "logps/chosen": -204.95266723632812, + "logps/rejected": -176.6099090576172, + "loss": 0.9906, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -0.02747512422502041, + "rewards/margins": 0.0013099886709824204, + "rewards/rejected": -0.028785113245248795, + "step": 960 + }, + { + "epoch": 0.28, + "learning_rate": 4.7133138969873657e-07, + "logits/chosen": -2.793271780014038, + "logits/rejected": -2.7566933631896973, + "logps/chosen": -211.52218627929688, + "logps/rejected": -174.21127319335938, + "loss": 0.9904, + "rewards/accuracies": 0.5218750238418579, + "rewards/chosen": -0.0174846388399601, + "rewards/margins": 0.011544780805706978, + "rewards/rejected": -0.029029419645667076, + "step": 970 + }, + { + "epoch": 0.29, + "learning_rate": 4.761904761904761e-07, + "logits/chosen": -2.7469844818115234, + "logits/rejected": -2.754621982574463, + "logps/chosen": -196.5634002685547, + "logps/rejected": -180.66799926757812, + "loss": 0.9948, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -0.02481316402554512, + "rewards/margins": 0.004062502179294825, + "rewards/rejected": -0.02887566387653351, + "step": 980 + }, + { + "epoch": 0.29, + "learning_rate": 4.810495626822157e-07, + "logits/chosen": -2.7565131187438965, + "logits/rejected": -2.7494494915008545, + "logps/chosen": -210.0545196533203, + "logps/rejected": -196.57020568847656, + "loss": 0.9908, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.01970556564629078, + "rewards/margins": 0.010368332266807556, + "rewards/rejected": -0.030073896050453186, + "step": 990 + }, + { + "epoch": 0.29, + "learning_rate": 4.859086491739553e-07, + "logits/chosen": -2.799431800842285, + "logits/rejected": -2.784318447113037, + "logps/chosen": -223.51602172851562, + "logps/rejected": -199.84771728515625, + "loss": 0.9879, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.021287377923727036, + "rewards/margins": 0.012864580377936363, + "rewards/rejected": -0.03415196016430855, + "step": 1000 + }, + { + "epoch": 0.29, + "eval_logits/chosen": -2.6796457767486572, + "eval_logits/rejected": -2.6746058464050293, + "eval_logps/chosen": -195.8248291015625, + "eval_logps/rejected": -181.2388458251953, + "eval_loss": 0.9906548857688904, + "eval_rewards/accuracies": 0.5475866794586182, + "eval_rewards/chosen": -0.02195058949291706, + "eval_rewards/margins": 0.009805315174162388, + "eval_rewards/rejected": -0.03175590559840202, + "eval_runtime": 443.5499, + "eval_samples_per_second": 26.525, + "eval_steps_per_second": 3.316, + "step": 1000 + }, + { + "epoch": 0.29, + "learning_rate": 4.907677356656948e-07, + "logits/chosen": -2.7441210746765137, + "logits/rejected": -2.77170467376709, + "logps/chosen": -180.41897583007812, + "logps/rejected": -174.46829223632812, + "loss": 0.9895, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.019895415753126144, + "rewards/margins": 0.00860919151455164, + "rewards/rejected": -0.02850460447371006, + "step": 1010 + }, + { + "epoch": 0.3, + "learning_rate": 4.956268221574344e-07, + "logits/chosen": -2.783003330230713, + "logits/rejected": -2.8088173866271973, + "logps/chosen": -196.19674682617188, + "logps/rejected": -198.5745391845703, + "loss": 0.9864, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.024192538112401962, + "rewards/margins": 0.017691707238554955, + "rewards/rejected": -0.041884247213602066, + "step": 1020 + }, + { + "epoch": 0.3, + "learning_rate": 4.999999855968691e-07, + "logits/chosen": -2.7660329341888428, + "logits/rejected": -2.7447166442871094, + "logps/chosen": -231.9657745361328, + "logps/rejected": -195.4416046142578, + "loss": 0.9941, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.027889683842658997, + "rewards/margins": 0.007279851473867893, + "rewards/rejected": -0.035169534385204315, + "step": 1030 + }, + { + "epoch": 0.3, + "learning_rate": 4.999982572231678e-07, + "logits/chosen": -2.7660796642303467, + "logits/rejected": -2.7666707038879395, + "logps/chosen": -184.3852996826172, + "logps/rejected": -176.80551147460938, + "loss": 0.992, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.023688379675149918, + "rewards/margins": 0.009250967763364315, + "rewards/rejected": -0.03293934464454651, + "step": 1040 + }, + { + "epoch": 0.31, + "learning_rate": 4.999936482461037e-07, + "logits/chosen": -2.759528160095215, + "logits/rejected": -2.7832791805267334, + "logps/chosen": -165.89126586914062, + "logps/rejected": -167.88478088378906, + "loss": 0.9909, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.019904453307390213, + "rewards/margins": 0.014452459290623665, + "rewards/rejected": -0.03435691073536873, + "step": 1050 + }, + { + "epoch": 0.31, + "learning_rate": 4.999861587187839e-07, + "logits/chosen": -2.7523863315582275, + "logits/rejected": -2.730530023574829, + "logps/chosen": -182.0239715576172, + "logps/rejected": -153.8279266357422, + "loss": 0.9911, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.02682223916053772, + "rewards/margins": 0.008590024895966053, + "rewards/rejected": -0.0354122593998909, + "step": 1060 + }, + { + "epoch": 0.31, + "learning_rate": 4.999757887275061e-07, + "logits/chosen": -2.7594218254089355, + "logits/rejected": -2.75547456741333, + "logps/chosen": -200.0524139404297, + "logps/rejected": -185.8358612060547, + "loss": 0.991, + "rewards/accuracies": 0.5093749761581421, + "rewards/chosen": -0.027161534875631332, + "rewards/margins": 0.004652016330510378, + "rewards/rejected": -0.03181355446577072, + "step": 1070 + }, + { + "epoch": 0.32, + "learning_rate": 4.999625383917586e-07, + "logits/chosen": -2.747807741165161, + "logits/rejected": -2.7675487995147705, + "logps/chosen": -195.44952392578125, + "logps/rejected": -187.72470092773438, + "loss": 0.9884, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.02705690823495388, + "rewards/margins": 0.012591761536896229, + "rewards/rejected": -0.039648670703172684, + "step": 1080 + }, + { + "epoch": 0.32, + "learning_rate": 4.999464078642184e-07, + "logits/chosen": -2.7693164348602295, + "logits/rejected": -2.7496602535247803, + "logps/chosen": -198.76596069335938, + "logps/rejected": -169.71527099609375, + "loss": 0.992, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.027691161260008812, + "rewards/margins": 0.009209704585373402, + "rewards/rejected": -0.03690087050199509, + "step": 1090 + }, + { + "epoch": 0.32, + "learning_rate": 4.999273973307493e-07, + "logits/chosen": -2.764065980911255, + "logits/rejected": -2.7625510692596436, + "logps/chosen": -198.4870147705078, + "logps/rejected": -185.19239807128906, + "loss": 0.9882, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.02553977072238922, + "rewards/margins": 0.015339975245296955, + "rewards/rejected": -0.0408797450363636, + "step": 1100 + }, + { + "epoch": 0.32, + "eval_logits/chosen": -2.669912815093994, + "eval_logits/rejected": -2.6647205352783203, + "eval_logps/chosen": -195.86611938476562, + "eval_logps/rejected": -181.32003784179688, + "eval_loss": 0.9869112372398376, + "eval_rewards/accuracies": 0.5598232746124268, + "eval_rewards/chosen": -0.026083307340741158, + "eval_rewards/margins": 0.013792227022349834, + "eval_rewards/rejected": -0.039875537157058716, + "eval_runtime": 443.4502, + "eval_samples_per_second": 26.531, + "eval_steps_per_second": 3.317, + "step": 1100 + }, + { + "epoch": 0.32, + "learning_rate": 4.999055070104e-07, + "logits/chosen": -2.764817714691162, + "logits/rejected": -2.7909953594207764, + "logps/chosen": -184.2716522216797, + "logps/rejected": -188.51341247558594, + "loss": 0.989, + "rewards/accuracies": 0.5718749761581421, + "rewards/chosen": -0.03175400570034981, + "rewards/margins": 0.015587709844112396, + "rewards/rejected": -0.047341711819171906, + "step": 1110 + }, + { + "epoch": 0.33, + "learning_rate": 4.998807371554017e-07, + "logits/chosen": -2.7454075813293457, + "logits/rejected": -2.748185396194458, + "logps/chosen": -189.30145263671875, + "logps/rejected": -183.9518280029297, + "loss": 0.9874, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.028897082433104515, + "rewards/margins": 0.016509678214788437, + "rewards/rejected": -0.045406755059957504, + "step": 1120 + }, + { + "epoch": 0.33, + "learning_rate": 4.998530880511649e-07, + "logits/chosen": -2.7354488372802734, + "logits/rejected": -2.7205042839050293, + "logps/chosen": -194.57437133789062, + "logps/rejected": -168.87326049804688, + "loss": 0.9852, + "rewards/accuracies": 0.534375011920929, + "rewards/chosen": -0.031062299385666847, + "rewards/margins": 0.014078726060688496, + "rewards/rejected": -0.04514102265238762, + "step": 1130 + }, + { + "epoch": 0.33, + "learning_rate": 4.998225600162761e-07, + "logits/chosen": -2.7497661113739014, + "logits/rejected": -2.765693426132202, + "logps/chosen": -183.91720581054688, + "logps/rejected": -189.841796875, + "loss": 0.9812, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.032630693167448044, + "rewards/margins": 0.017434656620025635, + "rewards/rejected": -0.05006534606218338, + "step": 1140 + }, + { + "epoch": 0.34, + "learning_rate": 4.997891534024945e-07, + "logits/chosen": -2.7627580165863037, + "logits/rejected": -2.7716870307922363, + "logps/chosen": -184.07809448242188, + "logps/rejected": -187.72488403320312, + "loss": 0.9814, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.0297432541847229, + "rewards/margins": 0.018422532826662064, + "rewards/rejected": -0.048165787011384964, + "step": 1150 + }, + { + "epoch": 0.34, + "learning_rate": 4.997528685947477e-07, + "logits/chosen": -2.781460762023926, + "logits/rejected": -2.7639803886413574, + "logps/chosen": -186.93238830566406, + "logps/rejected": -169.6712646484375, + "loss": 0.9864, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.031463898718357086, + "rewards/margins": 0.009577763266861439, + "rewards/rejected": -0.04104166105389595, + "step": 1160 + }, + { + "epoch": 0.34, + "learning_rate": 4.997137060111269e-07, + "logits/chosen": -2.729543447494507, + "logits/rejected": -2.760732889175415, + "logps/chosen": -170.96646118164062, + "logps/rejected": -179.94125366210938, + "loss": 0.9854, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.03276781365275383, + "rewards/margins": 0.018309885635972023, + "rewards/rejected": -0.051077693700790405, + "step": 1170 + }, + { + "epoch": 0.34, + "learning_rate": 4.996716661028829e-07, + "logits/chosen": -2.753207206726074, + "logits/rejected": -2.7457656860351562, + "logps/chosen": -197.0430450439453, + "logps/rejected": -188.0629425048828, + "loss": 0.9888, + "rewards/accuracies": 0.5218750238418579, + "rewards/chosen": -0.04059111326932907, + "rewards/margins": 0.007697828114032745, + "rewards/rejected": -0.04828894883394241, + "step": 1180 + }, + { + "epoch": 0.35, + "learning_rate": 4.996267493544203e-07, + "logits/chosen": -2.805116653442383, + "logits/rejected": -2.80474853515625, + "logps/chosen": -195.44107055664062, + "logps/rejected": -189.65139770507812, + "loss": 0.989, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.04101649299263954, + "rewards/margins": 0.009648426435887814, + "rewards/rejected": -0.05066491290926933, + "step": 1190 + }, + { + "epoch": 0.35, + "learning_rate": 4.99578956283292e-07, + "logits/chosen": -2.7761738300323486, + "logits/rejected": -2.792396306991577, + "logps/chosen": -202.1172332763672, + "logps/rejected": -195.01736450195312, + "loss": 0.979, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.028962809592485428, + "rewards/margins": 0.02408483624458313, + "rewards/rejected": -0.053047649562358856, + "step": 1200 + }, + { + "epoch": 0.35, + "eval_logits/chosen": -2.6734869480133057, + "eval_logits/rejected": -2.6683530807495117, + "eval_logps/chosen": -195.96934509277344, + "eval_logps/rejected": -181.4419403076172, + "eval_loss": 0.9851318597793579, + "eval_rewards/accuracies": 0.5562542676925659, + "eval_rewards/chosen": -0.0364016555249691, + "eval_rewards/margins": 0.015664031729102135, + "eval_rewards/rejected": -0.052065689116716385, + "eval_runtime": 443.4919, + "eval_samples_per_second": 26.528, + "eval_steps_per_second": 3.317, + "step": 1200 + }, + { + "epoch": 0.35, + "learning_rate": 4.995282874401933e-07, + "logits/chosen": -2.790585994720459, + "logits/rejected": -2.7970879077911377, + "logps/chosen": -195.25216674804688, + "logps/rejected": -186.48037719726562, + "loss": 0.9838, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.033409614115953445, + "rewards/margins": 0.017419548705220222, + "rewards/rejected": -0.05082916095852852, + "step": 1210 + }, + { + "epoch": 0.36, + "learning_rate": 4.994747434089559e-07, + "logits/chosen": -2.771557331085205, + "logits/rejected": -2.754660129547119, + "logps/chosen": -200.41087341308594, + "logps/rejected": -188.08175659179688, + "loss": 0.9935, + "rewards/accuracies": 0.5218750238418579, + "rewards/chosen": -0.045268464833498, + "rewards/margins": 0.007542738225311041, + "rewards/rejected": -0.05281120538711548, + "step": 1220 + }, + { + "epoch": 0.36, + "learning_rate": 4.994183248065402e-07, + "logits/chosen": -2.7263119220733643, + "logits/rejected": -2.7358851432800293, + "logps/chosen": -195.21957397460938, + "logps/rejected": -181.23416137695312, + "loss": 0.9856, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.035321272909641266, + "rewards/margins": 0.01393541507422924, + "rewards/rejected": -0.049256689846515656, + "step": 1230 + }, + { + "epoch": 0.36, + "learning_rate": 4.993590322830295e-07, + "logits/chosen": -2.7576966285705566, + "logits/rejected": -2.754138946533203, + "logps/chosen": -201.56475830078125, + "logps/rejected": -175.058349609375, + "loss": 0.9867, + "rewards/accuracies": 0.47187501192092896, + "rewards/chosen": -0.048004359006881714, + "rewards/margins": 0.007500568870455027, + "rewards/rejected": -0.055504925549030304, + "step": 1240 + }, + { + "epoch": 0.36, + "learning_rate": 4.992968665216213e-07, + "logits/chosen": -2.742941379547119, + "logits/rejected": -2.740309238433838, + "logps/chosen": -191.94155883789062, + "logps/rejected": -180.1873016357422, + "loss": 0.9865, + "rewards/accuracies": 0.5531250238418579, + "rewards/chosen": -0.03951577842235565, + "rewards/margins": 0.014310412108898163, + "rewards/rejected": -0.053826190531253815, + "step": 1250 + }, + { + "epoch": 0.37, + "learning_rate": 4.992318282386203e-07, + "logits/chosen": -2.796257495880127, + "logits/rejected": -2.7801544666290283, + "logps/chosen": -198.45379638671875, + "logps/rejected": -180.01675415039062, + "loss": 0.9877, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.043470896780490875, + "rewards/margins": 0.011671255342662334, + "rewards/rejected": -0.055142153054475784, + "step": 1260 + }, + { + "epoch": 0.37, + "learning_rate": 4.991639181834298e-07, + "logits/chosen": -2.7639927864074707, + "logits/rejected": -2.7846803665161133, + "logps/chosen": -188.55496215820312, + "logps/rejected": -180.0243377685547, + "loss": 0.9848, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.04286808520555496, + "rewards/margins": 0.01618005894124508, + "rewards/rejected": -0.05904814600944519, + "step": 1270 + }, + { + "epoch": 0.37, + "learning_rate": 4.990931371385427e-07, + "logits/chosen": -2.751796245574951, + "logits/rejected": -2.749959707260132, + "logps/chosen": -171.72152709960938, + "logps/rejected": -165.4966583251953, + "loss": 0.9802, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.03991890698671341, + "rewards/margins": 0.01386339496821165, + "rewards/rejected": -0.053782302886247635, + "step": 1280 + }, + { + "epoch": 0.38, + "learning_rate": 4.990194859195335e-07, + "logits/chosen": -2.8110146522521973, + "logits/rejected": -2.799238681793213, + "logps/chosen": -190.9245147705078, + "logps/rejected": -185.66787719726562, + "loss": 0.9916, + "rewards/accuracies": 0.515625, + "rewards/chosen": -0.046778354793787, + "rewards/margins": 0.009815091267228127, + "rewards/rejected": -0.05659344792366028, + "step": 1290 + }, + { + "epoch": 0.38, + "learning_rate": 4.989429653750478e-07, + "logits/chosen": -2.7405948638916016, + "logits/rejected": -2.764275550842285, + "logps/chosen": -186.57777404785156, + "logps/rejected": -181.9051055908203, + "loss": 0.985, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.04000754654407501, + "rewards/margins": 0.023956280201673508, + "rewards/rejected": -0.06396382302045822, + "step": 1300 + }, + { + "epoch": 0.38, + "eval_logits/chosen": -2.692143678665161, + "eval_logits/rejected": -2.687408447265625, + "eval_logps/chosen": -195.9900360107422, + "eval_logps/rejected": -181.49777221679688, + "eval_loss": 0.9818428158760071, + "eval_rewards/accuracies": 0.5608429908752441, + "eval_rewards/chosen": -0.038472600281238556, + "eval_rewards/margins": 0.01917639747262001, + "eval_rewards/rejected": -0.057648997753858566, + "eval_runtime": 443.4605, + "eval_samples_per_second": 26.53, + "eval_steps_per_second": 3.317, + "step": 1300 + }, + { + "epoch": 0.38, + "learning_rate": 4.988635763867929e-07, + "logits/chosen": -2.754769802093506, + "logits/rejected": -2.7642273902893066, + "logps/chosen": -171.62371826171875, + "logps/rejected": -173.17213439941406, + "loss": 0.9828, + "rewards/accuracies": 0.559374988079071, + "rewards/chosen": -0.0436868891119957, + "rewards/margins": 0.013914274051785469, + "rewards/rejected": -0.05760116130113602, + "step": 1310 + }, + { + "epoch": 0.39, + "learning_rate": 4.987813198695282e-07, + "logits/chosen": -2.799032688140869, + "logits/rejected": -2.7754616737365723, + "logps/chosen": -239.6453399658203, + "logps/rejected": -194.0479278564453, + "loss": 0.9799, + "rewards/accuracies": 0.534375011920929, + "rewards/chosen": -0.042483363300561905, + "rewards/margins": 0.020803894847631454, + "rewards/rejected": -0.06328727304935455, + "step": 1320 + }, + { + "epoch": 0.39, + "learning_rate": 4.986961967710538e-07, + "logits/chosen": -2.751368761062622, + "logits/rejected": -2.775388717651367, + "logps/chosen": -190.04867553710938, + "logps/rejected": -184.562255859375, + "loss": 0.9826, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.04249824956059456, + "rewards/margins": 0.01044423133134842, + "rewards/rejected": -0.05294247716665268, + "step": 1330 + }, + { + "epoch": 0.39, + "learning_rate": 4.986082080722e-07, + "logits/chosen": -2.718585729598999, + "logits/rejected": -2.753647804260254, + "logps/chosen": -155.80125427246094, + "logps/rejected": -167.8536834716797, + "loss": 0.9767, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.041705384850502014, + "rewards/margins": 0.018204566091299057, + "rewards/rejected": -0.05990995094180107, + "step": 1340 + }, + { + "epoch": 0.39, + "learning_rate": 4.985173547868161e-07, + "logits/chosen": -2.782245397567749, + "logits/rejected": -2.7602505683898926, + "logps/chosen": -193.46780395507812, + "logps/rejected": -168.34664916992188, + "loss": 0.9872, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.05123595520853996, + "rewards/margins": 0.011545160785317421, + "rewards/rejected": -0.06278111785650253, + "step": 1350 + }, + { + "epoch": 0.4, + "learning_rate": 4.984236379617585e-07, + "logits/chosen": -2.762777805328369, + "logits/rejected": -2.7644848823547363, + "logps/chosen": -191.2324676513672, + "logps/rejected": -175.95669555664062, + "loss": 0.9746, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.041074853390455246, + "rewards/margins": 0.027508120983839035, + "rewards/rejected": -0.06858296692371368, + "step": 1360 + }, + { + "epoch": 0.4, + "learning_rate": 4.983270586768788e-07, + "logits/chosen": -2.7665207386016846, + "logits/rejected": -2.7342450618743896, + "logps/chosen": -208.2240753173828, + "logps/rejected": -171.34295654296875, + "loss": 0.9826, + "rewards/accuracies": 0.528124988079071, + "rewards/chosen": -0.04937398433685303, + "rewards/margins": 0.010283073410391808, + "rewards/rejected": -0.059657059609889984, + "step": 1370 + }, + { + "epoch": 0.4, + "learning_rate": 4.982276180450112e-07, + "logits/chosen": -2.771615505218506, + "logits/rejected": -2.75608491897583, + "logps/chosen": -190.74603271484375, + "logps/rejected": -178.79782104492188, + "loss": 0.9848, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.046520378440618515, + "rewards/margins": 0.014152820222079754, + "rewards/rejected": -0.060673199594020844, + "step": 1380 + }, + { + "epoch": 0.41, + "learning_rate": 4.981253172119596e-07, + "logits/chosen": -2.7729361057281494, + "logits/rejected": -2.775089740753174, + "logps/chosen": -183.19204711914062, + "logps/rejected": -174.96011352539062, + "loss": 0.9854, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.05265643075108528, + "rewards/margins": 0.012122412212193012, + "rewards/rejected": -0.06477884203195572, + "step": 1390 + }, + { + "epoch": 0.41, + "learning_rate": 4.980201573564849e-07, + "logits/chosen": -2.7582387924194336, + "logits/rejected": -2.7580008506774902, + "logps/chosen": -186.2787322998047, + "logps/rejected": -192.34915161132812, + "loss": 0.9821, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.05323047563433647, + "rewards/margins": 0.02109154500067234, + "rewards/rejected": -0.07432201504707336, + "step": 1400 + }, + { + "epoch": 0.41, + "eval_logits/chosen": -2.6810476779937744, + "eval_logits/rejected": -2.676079511642456, + "eval_logps/chosen": -196.06719970703125, + "eval_logps/rejected": -181.589111328125, + "eval_loss": 0.98047935962677, + "eval_rewards/accuracies": 0.558973491191864, + "eval_rewards/chosen": -0.04618801176548004, + "eval_rewards/margins": 0.020594673231244087, + "eval_rewards/rejected": -0.06678267568349838, + "eval_runtime": 443.5004, + "eval_samples_per_second": 26.528, + "eval_steps_per_second": 3.317, + "step": 1400 + }, + { + "epoch": 0.41, + "learning_rate": 4.979121396902908e-07, + "logits/chosen": -2.7379727363586426, + "logits/rejected": -2.7336249351501465, + "logps/chosen": -203.15322875976562, + "logps/rejected": -181.72555541992188, + "loss": 0.9673, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.050723861902952194, + "rewards/margins": 0.030538350343704224, + "rewards/rejected": -0.08126221597194672, + "step": 1410 + }, + { + "epoch": 0.41, + "learning_rate": 4.978012654580102e-07, + "logits/chosen": -2.766727924346924, + "logits/rejected": -2.7514896392822266, + "logps/chosen": -203.69638061523438, + "logps/rejected": -176.80108642578125, + "loss": 0.9777, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.04542822390794754, + "rewards/margins": 0.02188730053603649, + "rewards/rejected": -0.06731553375720978, + "step": 1420 + }, + { + "epoch": 0.42, + "learning_rate": 4.976875359371907e-07, + "logits/chosen": -2.730456590652466, + "logits/rejected": -2.741044759750366, + "logps/chosen": -170.23345947265625, + "logps/rejected": -165.81503295898438, + "loss": 0.9798, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.05172818899154663, + "rewards/margins": 0.020575672388076782, + "rewards/rejected": -0.07230386137962341, + "step": 1430 + }, + { + "epoch": 0.42, + "learning_rate": 4.9757095243828e-07, + "logits/chosen": -2.7576966285705566, + "logits/rejected": -2.767139434814453, + "logps/chosen": -184.7630615234375, + "logps/rejected": -174.48568725585938, + "loss": 0.9791, + "rewards/accuracies": 0.5718749761581421, + "rewards/chosen": -0.048875968903303146, + "rewards/margins": 0.019326506182551384, + "rewards/rejected": -0.06820248067378998, + "step": 1440 + }, + { + "epoch": 0.42, + "learning_rate": 4.974515163046109e-07, + "logits/chosen": -2.755999803543091, + "logits/rejected": -2.7733826637268066, + "logps/chosen": -208.53012084960938, + "logps/rejected": -187.8560333251953, + "loss": 0.9791, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.05815325304865837, + "rewards/margins": 0.02635633386671543, + "rewards/rejected": -0.08450958877801895, + "step": 1450 + }, + { + "epoch": 0.43, + "learning_rate": 4.973292289123853e-07, + "logits/chosen": -2.7847931385040283, + "logits/rejected": -2.7798149585723877, + "logps/chosen": -192.0679168701172, + "logps/rejected": -189.32138061523438, + "loss": 0.9796, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.045699235051870346, + "rewards/margins": 0.02493266388773918, + "rewards/rejected": -0.07063189893960953, + "step": 1460 + }, + { + "epoch": 0.43, + "learning_rate": 4.972040916706591e-07, + "logits/chosen": -2.7773451805114746, + "logits/rejected": -2.7763779163360596, + "logps/chosen": -181.1774139404297, + "logps/rejected": -177.46780395507812, + "loss": 0.9744, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.051887236535549164, + "rewards/margins": 0.019978413358330727, + "rewards/rejected": -0.07186565548181534, + "step": 1470 + }, + { + "epoch": 0.43, + "learning_rate": 4.970761060213252e-07, + "logits/chosen": -2.7489306926727295, + "logits/rejected": -2.7517669200897217, + "logps/chosen": -191.70501708984375, + "logps/rejected": -186.7254180908203, + "loss": 0.9782, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.053087152540683746, + "rewards/margins": 0.023641914129257202, + "rewards/rejected": -0.07672907412052155, + "step": 1480 + }, + { + "epoch": 0.43, + "learning_rate": 4.969452734390976e-07, + "logits/chosen": -2.7764415740966797, + "logits/rejected": -2.740410566329956, + "logps/chosen": -223.160400390625, + "logps/rejected": -195.7806854248047, + "loss": 0.9679, + "rewards/accuracies": 0.6156250238418579, + "rewards/chosen": -0.05532132834196091, + "rewards/margins": 0.03490416705608368, + "rewards/rejected": -0.09022549539804459, + "step": 1490 + }, + { + "epoch": 0.44, + "learning_rate": 4.968115954314938e-07, + "logits/chosen": -2.7740657329559326, + "logits/rejected": -2.74410343170166, + "logps/chosen": -209.5076141357422, + "logps/rejected": -175.87014770507812, + "loss": 0.9822, + "rewards/accuracies": 0.528124988079071, + "rewards/chosen": -0.06310389190912247, + "rewards/margins": 0.008666287176311016, + "rewards/rejected": -0.07177017629146576, + "step": 1500 + }, + { + "epoch": 0.44, + "eval_logits/chosen": -2.6813409328460693, + "eval_logits/rejected": -2.6763975620269775, + "eval_logps/chosen": -196.1554412841797, + "eval_logps/rejected": -181.69834899902344, + "eval_loss": 0.9778910279273987, + "eval_rewards/accuracies": 0.5632222890853882, + "eval_rewards/chosen": -0.05501263216137886, + "eval_rewards/margins": 0.02269531972706318, + "eval_rewards/rejected": -0.07770795375108719, + "eval_runtime": 443.474, + "eval_samples_per_second": 26.529, + "eval_steps_per_second": 3.317, + "step": 1500 + }, + { + "epoch": 0.44, + "learning_rate": 4.966750735388179e-07, + "logits/chosen": -2.7591936588287354, + "logits/rejected": -2.750657558441162, + "logps/chosen": -192.50819396972656, + "logps/rejected": -178.97280883789062, + "loss": 0.9742, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.06468276679515839, + "rewards/margins": 0.017507528886198997, + "rewards/rejected": -0.08219029009342194, + "step": 1510 + }, + { + "epoch": 0.44, + "learning_rate": 4.965357093341425e-07, + "logits/chosen": -2.7657759189605713, + "logits/rejected": -2.724945306777954, + "logps/chosen": -214.63888549804688, + "logps/rejected": -174.35433959960938, + "loss": 0.9775, + "rewards/accuracies": 0.5218750238418579, + "rewards/chosen": -0.055322058498859406, + "rewards/margins": 0.0185992531478405, + "rewards/rejected": -0.0739213079214096, + "step": 1520 + }, + { + "epoch": 0.45, + "learning_rate": 4.963935044232909e-07, + "logits/chosen": -2.7615058422088623, + "logits/rejected": -2.72468900680542, + "logps/chosen": -201.26620483398438, + "logps/rejected": -174.67568969726562, + "loss": 0.9842, + "rewards/accuracies": 0.4906249940395355, + "rewards/chosen": -0.05784177780151367, + "rewards/margins": 0.015006395988166332, + "rewards/rejected": -0.07284817099571228, + "step": 1530 + }, + { + "epoch": 0.45, + "learning_rate": 4.96248460444818e-07, + "logits/chosen": -2.7774100303649902, + "logits/rejected": -2.782388210296631, + "logps/chosen": -174.90695190429688, + "logps/rejected": -176.49539184570312, + "loss": 0.9718, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.050717033445835114, + "rewards/margins": 0.02875804901123047, + "rewards/rejected": -0.07947508245706558, + "step": 1540 + }, + { + "epoch": 0.45, + "learning_rate": 4.961005790699925e-07, + "logits/chosen": -2.7708535194396973, + "logits/rejected": -2.760714054107666, + "logps/chosen": -197.136962890625, + "logps/rejected": -178.37271118164062, + "loss": 0.9687, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.04507363587617874, + "rewards/margins": 0.033439092338085175, + "rewards/rejected": -0.07851273566484451, + "step": 1550 + }, + { + "epoch": 0.46, + "learning_rate": 4.959498620027765e-07, + "logits/chosen": -2.738436222076416, + "logits/rejected": -2.746307849884033, + "logps/chosen": -203.83351135253906, + "logps/rejected": -183.6167755126953, + "loss": 0.9772, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -0.07137466967105865, + "rewards/margins": 0.01522110402584076, + "rewards/rejected": -0.08659576624631882, + "step": 1560 + }, + { + "epoch": 0.46, + "learning_rate": 4.957963109798064e-07, + "logits/chosen": -2.7385504245758057, + "logits/rejected": -2.754218816757202, + "logps/chosen": -183.70143127441406, + "logps/rejected": -180.86209106445312, + "loss": 0.9792, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.06631606817245483, + "rewards/margins": 0.027938008308410645, + "rewards/rejected": -0.09425408393144608, + "step": 1570 + }, + { + "epoch": 0.46, + "learning_rate": 4.956399277703729e-07, + "logits/chosen": -2.732557535171509, + "logits/rejected": -2.718478202819824, + "logps/chosen": -201.7747802734375, + "logps/rejected": -166.88467407226562, + "loss": 0.9817, + "rewards/accuracies": 0.5718749761581421, + "rewards/chosen": -0.06500306725502014, + "rewards/margins": 0.017368188127875328, + "rewards/rejected": -0.08237125724554062, + "step": 1580 + }, + { + "epoch": 0.46, + "learning_rate": 4.954807141764006e-07, + "logits/chosen": -2.7428674697875977, + "logits/rejected": -2.744955062866211, + "logps/chosen": -192.12753295898438, + "logps/rejected": -179.4935760498047, + "loss": 0.9814, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.0626605898141861, + "rewards/margins": 0.025600705295801163, + "rewards/rejected": -0.08826129138469696, + "step": 1590 + }, + { + "epoch": 0.47, + "learning_rate": 4.953186720324272e-07, + "logits/chosen": -2.78163480758667, + "logits/rejected": -2.754462718963623, + "logps/chosen": -207.11032104492188, + "logps/rejected": -178.58656311035156, + "loss": 0.9755, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.05041077733039856, + "rewards/margins": 0.030925389379262924, + "rewards/rejected": -0.08133616298437119, + "step": 1600 + }, + { + "epoch": 0.47, + "eval_logits/chosen": -2.65568208694458, + "eval_logits/rejected": -2.6502463817596436, + "eval_logps/chosen": -196.205810546875, + "eval_logps/rejected": -181.77638244628906, + "eval_loss": 0.9756138920783997, + "eval_rewards/accuracies": 0.565601646900177, + "eval_rewards/chosen": -0.06004924699664116, + "eval_rewards/margins": 0.02545936405658722, + "eval_rewards/rejected": -0.08550861477851868, + "eval_runtime": 443.4965, + "eval_samples_per_second": 26.528, + "eval_steps_per_second": 3.317, + "step": 1600 + }, + { + "epoch": 0.47, + "learning_rate": 4.951538032055822e-07, + "logits/chosen": -2.7657768726348877, + "logits/rejected": -2.76108980178833, + "logps/chosen": -204.87576293945312, + "logps/rejected": -192.89125061035156, + "loss": 0.9764, + "rewards/accuracies": 0.590624988079071, + "rewards/chosen": -0.05869390815496445, + "rewards/margins": 0.03274776414036751, + "rewards/rejected": -0.09144166857004166, + "step": 1610 + }, + { + "epoch": 0.47, + "learning_rate": 4.949861095955656e-07, + "logits/chosen": -2.7711009979248047, + "logits/rejected": -2.7802529335021973, + "logps/chosen": -198.54647827148438, + "logps/rejected": -204.2147216796875, + "loss": 0.9831, + "rewards/accuracies": 0.528124988079071, + "rewards/chosen": -0.07690480351448059, + "rewards/margins": 0.008501519449055195, + "rewards/rejected": -0.08540631830692291, + "step": 1620 + }, + { + "epoch": 0.48, + "learning_rate": 4.948155931346262e-07, + "logits/chosen": -2.752509117126465, + "logits/rejected": -2.750129222869873, + "logps/chosen": -204.9178009033203, + "logps/rejected": -195.44839477539062, + "loss": 0.9764, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.06369360536336899, + "rewards/margins": 0.022426560521125793, + "rewards/rejected": -0.08612016588449478, + "step": 1630 + }, + { + "epoch": 0.48, + "learning_rate": 4.946422557875386e-07, + "logits/chosen": -2.765320062637329, + "logits/rejected": -2.7900614738464355, + "logps/chosen": -183.15676879882812, + "logps/rejected": -191.41493225097656, + "loss": 0.9725, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.06169677525758743, + "rewards/margins": 0.025588322430849075, + "rewards/rejected": -0.0872851014137268, + "step": 1640 + }, + { + "epoch": 0.48, + "learning_rate": 4.944660995515814e-07, + "logits/chosen": -2.745473623275757, + "logits/rejected": -2.7588276863098145, + "logps/chosen": -195.170166015625, + "logps/rejected": -193.9084014892578, + "loss": 0.9754, + "rewards/accuracies": 0.515625, + "rewards/chosen": -0.07741503417491913, + "rewards/margins": 0.019316289573907852, + "rewards/rejected": -0.09673131257295609, + "step": 1650 + }, + { + "epoch": 0.48, + "learning_rate": 4.942871264565139e-07, + "logits/chosen": -2.734675407409668, + "logits/rejected": -2.739234447479248, + "logps/chosen": -188.9669647216797, + "logps/rejected": -180.78250122070312, + "loss": 0.9774, + "rewards/accuracies": 0.5531250238418579, + "rewards/chosen": -0.06605865061283112, + "rewards/margins": 0.022786004468798637, + "rewards/rejected": -0.0888446569442749, + "step": 1660 + }, + { + "epoch": 0.49, + "learning_rate": 4.941053385645522e-07, + "logits/chosen": -2.7543911933898926, + "logits/rejected": -2.7284069061279297, + "logps/chosen": -200.8400421142578, + "logps/rejected": -168.43988037109375, + "loss": 0.9813, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -0.0760786384344101, + "rewards/margins": 0.01815926656126976, + "rewards/rejected": -0.09423790872097015, + "step": 1670 + }, + { + "epoch": 0.49, + "learning_rate": 4.939207379703462e-07, + "logits/chosen": -2.7847373485565186, + "logits/rejected": -2.7574844360351562, + "logps/chosen": -199.1935272216797, + "logps/rejected": -177.27557373046875, + "loss": 0.971, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.06233362480998039, + "rewards/margins": 0.026879101991653442, + "rewards/rejected": -0.08921272307634354, + "step": 1680 + }, + { + "epoch": 0.49, + "learning_rate": 4.937333268009552e-07, + "logits/chosen": -2.748115062713623, + "logits/rejected": -2.726240873336792, + "logps/chosen": -204.0299835205078, + "logps/rejected": -180.19448852539062, + "loss": 0.9743, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.05886361747980118, + "rewards/margins": 0.024648474529385567, + "rewards/rejected": -0.0835120901465416, + "step": 1690 + }, + { + "epoch": 0.5, + "learning_rate": 4.935431072158234e-07, + "logits/chosen": -2.7623496055603027, + "logits/rejected": -2.7553372383117676, + "logps/chosen": -189.81854248046875, + "logps/rejected": -180.30911254882812, + "loss": 0.9697, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.07516200840473175, + "rewards/margins": 0.023591557517647743, + "rewards/rejected": -0.09875356405973434, + "step": 1700 + }, + { + "epoch": 0.5, + "eval_logits/chosen": -2.680124044418335, + "eval_logits/rejected": -2.675168037414551, + "eval_logps/chosen": -196.2568817138672, + "eval_logps/rejected": -181.85256958007812, + "eval_loss": 0.9731392860412598, + "eval_rewards/accuracies": 0.5650917887687683, + "eval_rewards/chosen": -0.06515874713659286, + "eval_rewards/margins": 0.02797050215303898, + "eval_rewards/rejected": -0.09312925487756729, + "eval_runtime": 443.319, + "eval_samples_per_second": 26.538, + "eval_steps_per_second": 3.318, + "step": 1700 + }, + { + "epoch": 0.5, + "learning_rate": 4.933500814067543e-07, + "logits/chosen": -2.7594799995422363, + "logits/rejected": -2.7589800357818604, + "logps/chosen": -212.92599487304688, + "logps/rejected": -188.29454040527344, + "loss": 0.96, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.05867772176861763, + "rewards/margins": 0.041473548859357834, + "rewards/rejected": -0.10015126317739487, + "step": 1710 + }, + { + "epoch": 0.5, + "learning_rate": 4.931542515978871e-07, + "logits/chosen": -2.7504467964172363, + "logits/rejected": -2.756544828414917, + "logps/chosen": -212.72036743164062, + "logps/rejected": -204.91036987304688, + "loss": 0.9724, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.07226310670375824, + "rewards/margins": 0.038937319070100784, + "rewards/rejected": -0.11120043694972992, + "step": 1720 + }, + { + "epoch": 0.5, + "learning_rate": 4.929556200456692e-07, + "logits/chosen": -2.745439052581787, + "logits/rejected": -2.7529444694519043, + "logps/chosen": -170.9637908935547, + "logps/rejected": -165.9134521484375, + "loss": 0.9841, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.06306852400302887, + "rewards/margins": 0.024060754105448723, + "rewards/rejected": -0.08712927997112274, + "step": 1730 + }, + { + "epoch": 0.51, + "learning_rate": 4.927541890388315e-07, + "logits/chosen": -2.75010085105896, + "logits/rejected": -2.741394519805908, + "logps/chosen": -213.6536102294922, + "logps/rejected": -198.53201293945312, + "loss": 0.9682, + "rewards/accuracies": 0.5718749761581421, + "rewards/chosen": -0.0757746696472168, + "rewards/margins": 0.02875874564051628, + "rewards/rejected": -0.10453341156244278, + "step": 1740 + }, + { + "epoch": 0.51, + "learning_rate": 4.925499608983617e-07, + "logits/chosen": -2.7459850311279297, + "logits/rejected": -2.754770040512085, + "logps/chosen": -208.39181518554688, + "logps/rejected": -188.983642578125, + "loss": 0.9612, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.0683453232049942, + "rewards/margins": 0.03796737641096115, + "rewards/rejected": -0.10631269216537476, + "step": 1750 + }, + { + "epoch": 0.51, + "learning_rate": 4.92342937977477e-07, + "logits/chosen": -2.7780871391296387, + "logits/rejected": -2.769228458404541, + "logps/chosen": -236.0298309326172, + "logps/rejected": -210.1380157470703, + "loss": 0.9676, + "rewards/accuracies": 0.546875, + "rewards/chosen": -0.07248321920633316, + "rewards/margins": 0.0332036130130291, + "rewards/rejected": -0.10568682849407196, + "step": 1760 + }, + { + "epoch": 0.52, + "learning_rate": 4.921331226615981e-07, + "logits/chosen": -2.7420806884765625, + "logits/rejected": -2.7397849559783936, + "logps/chosen": -196.43295288085938, + "logps/rejected": -178.20077514648438, + "loss": 0.9717, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.07191386073827744, + "rewards/margins": 0.029133638367056847, + "rewards/rejected": -0.10104749351739883, + "step": 1770 + }, + { + "epoch": 0.52, + "learning_rate": 4.919205173683202e-07, + "logits/chosen": -2.7878201007843018, + "logits/rejected": -2.7463841438293457, + "logps/chosen": -224.4694366455078, + "logps/rejected": -186.58062744140625, + "loss": 0.9756, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.06282098591327667, + "rewards/margins": 0.02961428463459015, + "rewards/rejected": -0.09243526309728622, + "step": 1780 + }, + { + "epoch": 0.52, + "learning_rate": 4.917051245473868e-07, + "logits/chosen": -2.756880283355713, + "logits/rejected": -2.7773396968841553, + "logps/chosen": -186.11907958984375, + "logps/rejected": -195.0716094970703, + "loss": 0.9748, + "rewards/accuracies": 0.5718749761581421, + "rewards/chosen": -0.07742653787136078, + "rewards/margins": 0.024380305781960487, + "rewards/rejected": -0.10180683434009552, + "step": 1790 + }, + { + "epoch": 0.53, + "learning_rate": 4.914869466806603e-07, + "logits/chosen": -2.7491137981414795, + "logits/rejected": -2.757794141769409, + "logps/chosen": -178.83447265625, + "logps/rejected": -172.74996948242188, + "loss": 0.969, + "rewards/accuracies": 0.590624988079071, + "rewards/chosen": -0.07571407407522202, + "rewards/margins": 0.029796475544571877, + "rewards/rejected": -0.10551054775714874, + "step": 1800 + }, + { + "epoch": 0.53, + "eval_logits/chosen": -2.6686718463897705, + "eval_logits/rejected": -2.6634953022003174, + "eval_logps/chosen": -196.30665588378906, + "eval_logps/rejected": -181.93798828125, + "eval_loss": 0.9698388576507568, + "eval_rewards/accuracies": 0.5686607956886292, + "eval_rewards/chosen": -0.07013525068759918, + "eval_rewards/margins": 0.03153569623827934, + "eval_rewards/rejected": -0.10167094320058823, + "eval_runtime": 443.4506, + "eval_samples_per_second": 26.531, + "eval_steps_per_second": 3.317, + "step": 1800 + }, + { + "epoch": 0.53, + "learning_rate": 4.912659862820937e-07, + "logits/chosen": -2.7059569358825684, + "logits/rejected": -2.734696865081787, + "logps/chosen": -178.76712036132812, + "logps/rejected": -186.81173706054688, + "loss": 0.9687, + "rewards/accuracies": 0.5531250238418579, + "rewards/chosen": -0.08366887271404266, + "rewards/margins": 0.02821827493607998, + "rewards/rejected": -0.1118871420621872, + "step": 1810 + }, + { + "epoch": 0.53, + "learning_rate": 4.910422458977018e-07, + "logits/chosen": -2.7654166221618652, + "logits/rejected": -2.7601962089538574, + "logps/chosen": -189.85423278808594, + "logps/rejected": -185.08970642089844, + "loss": 0.9745, + "rewards/accuracies": 0.534375011920929, + "rewards/chosen": -0.07412820309400558, + "rewards/margins": 0.026077458634972572, + "rewards/rejected": -0.1002056747674942, + "step": 1820 + }, + { + "epoch": 0.53, + "learning_rate": 4.90815728105532e-07, + "logits/chosen": -2.7707886695861816, + "logits/rejected": -2.7747719287872314, + "logps/chosen": -186.23532104492188, + "logps/rejected": -183.20777893066406, + "loss": 0.9594, + "rewards/accuracies": 0.5531250238418579, + "rewards/chosen": -0.06935007870197296, + "rewards/margins": 0.030855247750878334, + "rewards/rejected": -0.10020531713962555, + "step": 1830 + }, + { + "epoch": 0.54, + "learning_rate": 4.90586435515634e-07, + "logits/chosen": -2.7859041690826416, + "logits/rejected": -2.7700035572052, + "logps/chosen": -213.6102752685547, + "logps/rejected": -195.71310424804688, + "loss": 0.9682, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.07400950789451599, + "rewards/margins": 0.03597418963909149, + "rewards/rejected": -0.10998369753360748, + "step": 1840 + }, + { + "epoch": 0.54, + "learning_rate": 4.903543707700302e-07, + "logits/chosen": -2.761504650115967, + "logits/rejected": -2.7782986164093018, + "logps/chosen": -198.09092712402344, + "logps/rejected": -186.11976623535156, + "loss": 0.9752, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.07849571853876114, + "rewards/margins": 0.02638430893421173, + "rewards/rejected": -0.10488002002239227, + "step": 1850 + }, + { + "epoch": 0.54, + "learning_rate": 4.901195365426851e-07, + "logits/chosen": -2.744083881378174, + "logits/rejected": -2.745847225189209, + "logps/chosen": -196.6710662841797, + "logps/rejected": -183.01327514648438, + "loss": 0.9633, + "rewards/accuracies": 0.596875011920929, + "rewards/chosen": -0.06702904403209686, + "rewards/margins": 0.0462227538228035, + "rewards/rejected": -0.11325179040431976, + "step": 1860 + }, + { + "epoch": 0.55, + "learning_rate": 4.89881935539475e-07, + "logits/chosen": -2.761718273162842, + "logits/rejected": -2.7792365550994873, + "logps/chosen": -204.8824005126953, + "logps/rejected": -189.04917907714844, + "loss": 0.9684, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.07677263021469116, + "rewards/margins": 0.03511573746800423, + "rewards/rejected": -0.11188837140798569, + "step": 1870 + }, + { + "epoch": 0.55, + "learning_rate": 4.896415704981556e-07, + "logits/chosen": -2.7535223960876465, + "logits/rejected": -2.7572989463806152, + "logps/chosen": -205.97634887695312, + "logps/rejected": -198.9563751220703, + "loss": 0.9586, + "rewards/accuracies": 0.6031249761581421, + "rewards/chosen": -0.06817978620529175, + "rewards/margins": 0.04087362438440323, + "rewards/rejected": -0.10905341804027557, + "step": 1880 + }, + { + "epoch": 0.55, + "learning_rate": 4.893984441883317e-07, + "logits/chosen": -2.737229347229004, + "logits/rejected": -2.7194113731384277, + "logps/chosen": -184.98281860351562, + "logps/rejected": -170.07290649414062, + "loss": 0.9744, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.075322724878788, + "rewards/margins": 0.025637894868850708, + "rewards/rejected": -0.1009606122970581, + "step": 1890 + }, + { + "epoch": 0.55, + "learning_rate": 4.891525594114248e-07, + "logits/chosen": -2.759535789489746, + "logits/rejected": -2.748884677886963, + "logps/chosen": -206.9478759765625, + "logps/rejected": -191.83497619628906, + "loss": 0.9643, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.08102191984653473, + "rewards/margins": 0.0340164490044117, + "rewards/rejected": -0.11503837257623672, + "step": 1900 + }, + { + "epoch": 0.55, + "eval_logits/chosen": -2.6642489433288574, + "eval_logits/rejected": -2.659011125564575, + "eval_logps/chosen": -196.3668670654297, + "eval_logps/rejected": -182.01368713378906, + "eval_loss": 0.9684566855430603, + "eval_rewards/accuracies": 0.5676410794258118, + "eval_rewards/chosen": -0.07615655660629272, + "eval_rewards/margins": 0.03308200463652611, + "eval_rewards/rejected": -0.10923856496810913, + "eval_runtime": 443.3461, + "eval_samples_per_second": 26.537, + "eval_steps_per_second": 3.318, + "step": 1900 + }, + { + "epoch": 0.56, + "learning_rate": 4.889039190006407e-07, + "logits/chosen": -2.7680182456970215, + "logits/rejected": -2.7183163166046143, + "logps/chosen": -196.04763793945312, + "logps/rejected": -171.93922424316406, + "loss": 0.9648, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.073929563164711, + "rewards/margins": 0.03020293638110161, + "rewards/rejected": -0.10413248836994171, + "step": 1910 + }, + { + "epoch": 0.56, + "learning_rate": 4.886525258209368e-07, + "logits/chosen": -2.789874315261841, + "logits/rejected": -2.7898380756378174, + "logps/chosen": -203.37677001953125, + "logps/rejected": -184.8866729736328, + "loss": 0.9636, + "rewards/accuracies": 0.5718749761581421, + "rewards/chosen": -0.08382896333932877, + "rewards/margins": 0.03888505697250366, + "rewards/rejected": -0.12271402031183243, + "step": 1920 + }, + { + "epoch": 0.56, + "learning_rate": 4.883983827689896e-07, + "logits/chosen": -2.7781100273132324, + "logits/rejected": -2.7623000144958496, + "logps/chosen": -224.9477081298828, + "logps/rejected": -200.4647674560547, + "loss": 0.9727, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.07715074717998505, + "rewards/margins": 0.026067961007356644, + "rewards/rejected": -0.1032186970114708, + "step": 1930 + }, + { + "epoch": 0.57, + "learning_rate": 4.881414927731608e-07, + "logits/chosen": -2.766394853591919, + "logits/rejected": -2.778665781021118, + "logps/chosen": -213.1951446533203, + "logps/rejected": -201.0144500732422, + "loss": 0.9584, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.08078450709581375, + "rewards/margins": 0.035427432507276535, + "rewards/rejected": -0.11621192842721939, + "step": 1940 + }, + { + "epoch": 0.57, + "learning_rate": 4.878818587934638e-07, + "logits/chosen": -2.7382991313934326, + "logits/rejected": -2.7401552200317383, + "logps/chosen": -182.39071655273438, + "logps/rejected": -173.4800567626953, + "loss": 0.9522, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.06153035908937454, + "rewards/margins": 0.051566898822784424, + "rewards/rejected": -0.11309723556041718, + "step": 1950 + }, + { + "epoch": 0.57, + "learning_rate": 4.876194838215295e-07, + "logits/chosen": -2.759721040725708, + "logits/rejected": -2.756985902786255, + "logps/chosen": -179.3950653076172, + "logps/rejected": -173.28369140625, + "loss": 0.9635, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.0704420655965805, + "rewards/margins": 0.03925835341215134, + "rewards/rejected": -0.10970041900873184, + "step": 1960 + }, + { + "epoch": 0.57, + "learning_rate": 4.873543708805718e-07, + "logits/chosen": -2.767444133758545, + "logits/rejected": -2.728198766708374, + "logps/chosen": -218.9180145263672, + "logps/rejected": -189.20535278320312, + "loss": 0.9681, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.087130606174469, + "rewards/margins": 0.035440556704998016, + "rewards/rejected": -0.12257115542888641, + "step": 1970 + }, + { + "epoch": 0.58, + "learning_rate": 4.870865230253532e-07, + "logits/chosen": -2.7817773818969727, + "logits/rejected": -2.7752878665924072, + "logps/chosen": -197.40106201171875, + "logps/rejected": -178.65773010253906, + "loss": 0.9687, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.08426558971405029, + "rewards/margins": 0.043002355843782425, + "rewards/rejected": -0.12726792693138123, + "step": 1980 + }, + { + "epoch": 0.58, + "learning_rate": 4.868159433421485e-07, + "logits/chosen": -2.7358546257019043, + "logits/rejected": -2.738718032836914, + "logps/chosen": -201.25308227539062, + "logps/rejected": -195.88258361816406, + "loss": 0.9672, + "rewards/accuracies": 0.5531250238418579, + "rewards/chosen": -0.09167347848415375, + "rewards/margins": 0.030366484075784683, + "rewards/rejected": -0.12203995883464813, + "step": 1990 + }, + { + "epoch": 0.58, + "learning_rate": 4.865426349487108e-07, + "logits/chosen": -2.8001911640167236, + "logits/rejected": -2.7797961235046387, + "logps/chosen": -221.62557983398438, + "logps/rejected": -200.5688934326172, + "loss": 0.9655, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.08506297320127487, + "rewards/margins": 0.04144131764769554, + "rewards/rejected": -0.12650427222251892, + "step": 2000 + }, + { + "epoch": 0.58, + "eval_logits/chosen": -2.685004234313965, + "eval_logits/rejected": -2.680168390274048, + "eval_logps/chosen": -196.42654418945312, + "eval_logps/rejected": -182.10122680664062, + "eval_loss": 0.9663463234901428, + "eval_rewards/accuracies": 0.5756288170814514, + "eval_rewards/chosen": -0.08212257921695709, + "eval_rewards/margins": 0.03587257117033005, + "eval_rewards/rejected": -0.11799515038728714, + "eval_runtime": 443.3293, + "eval_samples_per_second": 26.538, + "eval_steps_per_second": 3.318, + "step": 2000 + }, + { + "epoch": 0.59, + "learning_rate": 4.862666009942341e-07, + "logits/chosen": -2.7604477405548096, + "logits/rejected": -2.767010450363159, + "logps/chosen": -203.9152069091797, + "logps/rejected": -190.85159301757812, + "loss": 0.9659, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.09103678911924362, + "rewards/margins": 0.03718984127044678, + "rewards/rejected": -0.1282266229391098, + "step": 2010 + }, + { + "epoch": 0.59, + "learning_rate": 4.859878446593181e-07, + "logits/chosen": -2.7273712158203125, + "logits/rejected": -2.7808773517608643, + "logps/chosen": -181.61282348632812, + "logps/rejected": -196.00958251953125, + "loss": 0.9574, + "rewards/accuracies": 0.640625, + "rewards/chosen": -0.07144895195960999, + "rewards/margins": 0.059774626046419144, + "rewards/rejected": -0.13122357428073883, + "step": 2020 + }, + { + "epoch": 0.59, + "learning_rate": 4.857063691559309e-07, + "logits/chosen": -2.7482104301452637, + "logits/rejected": -2.7654850482940674, + "logps/chosen": -194.5123291015625, + "logps/rejected": -180.79283142089844, + "loss": 0.9564, + "rewards/accuracies": 0.6031249761581421, + "rewards/chosen": -0.08377393335103989, + "rewards/margins": 0.050066135823726654, + "rewards/rejected": -0.13384008407592773, + "step": 2030 + }, + { + "epoch": 0.6, + "learning_rate": 4.854221777273722e-07, + "logits/chosen": -2.752758741378784, + "logits/rejected": -2.755716562271118, + "logps/chosen": -170.12339782714844, + "logps/rejected": -163.80215454101562, + "loss": 0.9652, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.07802867144346237, + "rewards/margins": 0.03308725357055664, + "rewards/rejected": -0.11111593246459961, + "step": 2040 + }, + { + "epoch": 0.6, + "learning_rate": 4.851352736482359e-07, + "logits/chosen": -2.743976354598999, + "logits/rejected": -2.7437987327575684, + "logps/chosen": -209.16943359375, + "logps/rejected": -186.8437042236328, + "loss": 0.9593, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.07512355595827103, + "rewards/margins": 0.04379050433635712, + "rewards/rejected": -0.11891404539346695, + "step": 2050 + }, + { + "epoch": 0.6, + "learning_rate": 4.848456602243726e-07, + "logits/chosen": -2.76847243309021, + "logits/rejected": -2.7415621280670166, + "logps/chosen": -215.87521362304688, + "logps/rejected": -188.3891143798828, + "loss": 0.9646, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.09460041671991348, + "rewards/margins": 0.02119792439043522, + "rewards/rejected": -0.11579833179712296, + "step": 2060 + }, + { + "epoch": 0.6, + "learning_rate": 4.84553340792851e-07, + "logits/chosen": -2.728362560272217, + "logits/rejected": -2.7296814918518066, + "logps/chosen": -198.44586181640625, + "logps/rejected": -184.37933349609375, + "loss": 0.971, + "rewards/accuracies": 0.515625, + "rewards/chosen": -0.10106942802667618, + "rewards/margins": 0.027541017159819603, + "rewards/rejected": -0.12861044704914093, + "step": 2070 + }, + { + "epoch": 0.61, + "learning_rate": 4.842583187219201e-07, + "logits/chosen": -2.7220025062561035, + "logits/rejected": -2.715425491333008, + "logps/chosen": -184.50009155273438, + "logps/rejected": -179.82077026367188, + "loss": 0.97, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.08983436971902847, + "rewards/margins": 0.02670256420969963, + "rewards/rejected": -0.1165369376540184, + "step": 2080 + }, + { + "epoch": 0.61, + "learning_rate": 4.839605974109698e-07, + "logits/chosen": -2.754271984100342, + "logits/rejected": -2.7299962043762207, + "logps/chosen": -193.97775268554688, + "logps/rejected": -174.15480041503906, + "loss": 0.9539, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.08133591711521149, + "rewards/margins": 0.050822339951992035, + "rewards/rejected": -0.13215824961662292, + "step": 2090 + }, + { + "epoch": 0.61, + "learning_rate": 4.836601802904922e-07, + "logits/chosen": -2.7427401542663574, + "logits/rejected": -2.75071382522583, + "logps/chosen": -188.09854125976562, + "logps/rejected": -172.60829162597656, + "loss": 0.9719, + "rewards/accuracies": 0.559374988079071, + "rewards/chosen": -0.09669375419616699, + "rewards/margins": 0.03804296255111694, + "rewards/rejected": -0.13473671674728394, + "step": 2100 + }, + { + "epoch": 0.61, + "eval_logits/chosen": -2.6727218627929688, + "eval_logits/rejected": -2.6676571369171143, + "eval_logps/chosen": -196.51327514648438, + "eval_logps/rejected": -182.2023468017578, + "eval_loss": 0.9645021557807922, + "eval_rewards/accuracies": 0.5676410794258118, + "eval_rewards/chosen": -0.09079542011022568, + "eval_rewards/margins": 0.037310030311346054, + "eval_rewards/rejected": -0.12810543179512024, + "eval_runtime": 443.4647, + "eval_samples_per_second": 26.53, + "eval_steps_per_second": 3.317, + "step": 2100 + }, + { + "epoch": 0.62, + "learning_rate": 4.833570708220415e-07, + "logits/chosen": -2.764371156692505, + "logits/rejected": -2.7658863067626953, + "logps/chosen": -213.55459594726562, + "logps/rejected": -204.02517700195312, + "loss": 0.9497, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.08745203167200089, + "rewards/margins": 0.05170099064707756, + "rewards/rejected": -0.13915303349494934, + "step": 2110 + }, + { + "epoch": 0.62, + "learning_rate": 4.830512724981947e-07, + "logits/chosen": -2.758815288543701, + "logits/rejected": -2.7648801803588867, + "logps/chosen": -191.48367309570312, + "logps/rejected": -177.66061401367188, + "loss": 0.9642, + "rewards/accuracies": 0.5406249761581421, + "rewards/chosen": -0.09611859172582626, + "rewards/margins": 0.026732753962278366, + "rewards/rejected": -0.12285135686397552, + "step": 2120 + }, + { + "epoch": 0.62, + "learning_rate": 4.827427888425111e-07, + "logits/chosen": -2.7500061988830566, + "logits/rejected": -2.7405474185943604, + "logps/chosen": -186.7626953125, + "logps/rejected": -175.85812377929688, + "loss": 0.9665, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.08782060444355011, + "rewards/margins": 0.04283968359231949, + "rewards/rejected": -0.130660280585289, + "step": 2130 + }, + { + "epoch": 0.62, + "learning_rate": 4.824316234094918e-07, + "logits/chosen": -2.7345998287200928, + "logits/rejected": -2.734236478805542, + "logps/chosen": -179.12454223632812, + "logps/rejected": -177.08853149414062, + "loss": 0.9586, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.08842668682336807, + "rewards/margins": 0.031327299773693085, + "rewards/rejected": -0.11975400149822235, + "step": 2140 + }, + { + "epoch": 0.63, + "learning_rate": 4.821177797845383e-07, + "logits/chosen": -2.7678306102752686, + "logits/rejected": -2.7621452808380127, + "logps/chosen": -210.3707275390625, + "logps/rejected": -183.60426330566406, + "loss": 0.9621, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.09107877314090729, + "rewards/margins": 0.04685630649328232, + "rewards/rejected": -0.1379350870847702, + "step": 2150 + }, + { + "epoch": 0.63, + "learning_rate": 4.818012615839122e-07, + "logits/chosen": -2.7913317680358887, + "logits/rejected": -2.773393154144287, + "logps/chosen": -204.22744750976562, + "logps/rejected": -177.96424865722656, + "loss": 0.9603, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -0.10166385024785995, + "rewards/margins": 0.01808108761906624, + "rewards/rejected": -0.11974494159221649, + "step": 2160 + }, + { + "epoch": 0.63, + "learning_rate": 4.814820724546923e-07, + "logits/chosen": -2.7454721927642822, + "logits/rejected": -2.732819080352783, + "logps/chosen": -180.0576171875, + "logps/rejected": -170.5610809326172, + "loss": 0.9626, + "rewards/accuracies": 0.5218750238418579, + "rewards/chosen": -0.09514407813549042, + "rewards/margins": 0.03504454344511032, + "rewards/rejected": -0.13018861413002014, + "step": 2170 + }, + { + "epoch": 0.64, + "learning_rate": 4.811602160747332e-07, + "logits/chosen": -2.7451462745666504, + "logits/rejected": -2.7270007133483887, + "logps/chosen": -206.5254364013672, + "logps/rejected": -194.58749389648438, + "loss": 0.9776, + "rewards/accuracies": 0.47187501192092896, + "rewards/chosen": -0.1110968142747879, + "rewards/margins": 0.020342206582427025, + "rewards/rejected": -0.13143901526927948, + "step": 2180 + }, + { + "epoch": 0.64, + "learning_rate": 4.808356961526233e-07, + "logits/chosen": -2.7640247344970703, + "logits/rejected": -2.760708808898926, + "logps/chosen": -213.62130737304688, + "logps/rejected": -186.7675018310547, + "loss": 0.9709, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.10413794219493866, + "rewards/margins": 0.03650267794728279, + "rewards/rejected": -0.14064063131809235, + "step": 2190 + }, + { + "epoch": 0.64, + "learning_rate": 4.805085164276413e-07, + "logits/chosen": -2.7213828563690186, + "logits/rejected": -2.7300868034362793, + "logps/chosen": -202.58807373046875, + "logps/rejected": -183.22645568847656, + "loss": 0.9576, + "rewards/accuracies": 0.5531250238418579, + "rewards/chosen": -0.09867699444293976, + "rewards/margins": 0.04763052612543106, + "rewards/rejected": -0.14630751311779022, + "step": 2200 + }, + { + "epoch": 0.64, + "eval_logits/chosen": -2.6729633808135986, + "eval_logits/rejected": -2.6679437160491943, + "eval_logps/chosen": -196.5585479736328, + "eval_logps/rejected": -182.2709197998047, + "eval_loss": 0.9625018239021301, + "eval_rewards/accuracies": 0.5729095935821533, + "eval_rewards/chosen": -0.09532498568296432, + "eval_rewards/margins": 0.03963753208518028, + "eval_rewards/rejected": -0.1349625140428543, + "eval_runtime": 443.5452, + "eval_samples_per_second": 26.525, + "eval_steps_per_second": 3.316, + "step": 2200 + }, + { + "epoch": 0.64, + "learning_rate": 4.801786806697134e-07, + "logits/chosen": -2.754920721054077, + "logits/rejected": -2.762866973876953, + "logps/chosen": -196.8423309326172, + "logps/rejected": -195.01043701171875, + "loss": 0.9552, + "rewards/accuracies": 0.6031249761581421, + "rewards/chosen": -0.1039787083864212, + "rewards/margins": 0.040667999535799026, + "rewards/rejected": -0.14464668929576874, + "step": 2210 + }, + { + "epoch": 0.65, + "learning_rate": 4.798461926793703e-07, + "logits/chosen": -2.7615623474121094, + "logits/rejected": -2.7736001014709473, + "logps/chosen": -209.5021209716797, + "logps/rejected": -202.2715301513672, + "loss": 0.9615, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.1150670200586319, + "rewards/margins": 0.040112510323524475, + "rewards/rejected": -0.15517953038215637, + "step": 2220 + }, + { + "epoch": 0.65, + "learning_rate": 4.795110562877026e-07, + "logits/chosen": -2.7603373527526855, + "logits/rejected": -2.7503154277801514, + "logps/chosen": -187.4466552734375, + "logps/rejected": -181.63955688476562, + "loss": 0.9569, + "rewards/accuracies": 0.528124988079071, + "rewards/chosen": -0.11598215252161026, + "rewards/margins": 0.035527873784303665, + "rewards/rejected": -0.15151001513004303, + "step": 2230 + }, + { + "epoch": 0.65, + "learning_rate": 4.791732753563174e-07, + "logits/chosen": -2.7604167461395264, + "logits/rejected": -2.7718167304992676, + "logps/chosen": -192.2855682373047, + "logps/rejected": -194.53607177734375, + "loss": 0.9622, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.12070713192224503, + "rewards/margins": 0.022978540509939194, + "rewards/rejected": -0.14368568360805511, + "step": 2240 + }, + { + "epoch": 0.66, + "learning_rate": 4.788328537772933e-07, + "logits/chosen": -2.78924298286438, + "logits/rejected": -2.799626350402832, + "logps/chosen": -204.7958984375, + "logps/rejected": -193.9917449951172, + "loss": 0.9451, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.09752093255519867, + "rewards/margins": 0.055007584393024445, + "rewards/rejected": -0.15252849459648132, + "step": 2250 + }, + { + "epoch": 0.66, + "learning_rate": 4.784897954731359e-07, + "logits/chosen": -2.73913836479187, + "logits/rejected": -2.7504515647888184, + "logps/chosen": -188.52005004882812, + "logps/rejected": -196.14346313476562, + "loss": 0.9666, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.09183724224567413, + "rewards/margins": 0.04959854856133461, + "rewards/rejected": -0.14143578708171844, + "step": 2260 + }, + { + "epoch": 0.66, + "learning_rate": 4.781441043967323e-07, + "logits/chosen": -2.7440826892852783, + "logits/rejected": -2.760591983795166, + "logps/chosen": -203.64828491210938, + "logps/rejected": -200.84938049316406, + "loss": 0.9672, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.11135001480579376, + "rewards/margins": 0.037941206246614456, + "rewards/rejected": -0.14929120242595673, + "step": 2270 + }, + { + "epoch": 0.67, + "learning_rate": 4.777957845313058e-07, + "logits/chosen": -2.7185616493225098, + "logits/rejected": -2.7239575386047363, + "logps/chosen": -179.6100311279297, + "logps/rejected": -172.4264678955078, + "loss": 0.9655, + "rewards/accuracies": 0.5406249761581421, + "rewards/chosen": -0.10776621103286743, + "rewards/margins": 0.03039861097931862, + "rewards/rejected": -0.13816483318805695, + "step": 2280 + }, + { + "epoch": 0.67, + "learning_rate": 4.7744483989037e-07, + "logits/chosen": -2.7633557319641113, + "logits/rejected": -2.7489027976989746, + "logps/chosen": -192.33351135253906, + "logps/rejected": -172.5152130126953, + "loss": 0.9547, + "rewards/accuracies": 0.596875011920929, + "rewards/chosen": -0.10020127147436142, + "rewards/margins": 0.042753200978040695, + "rewards/rejected": -0.142954483628273, + "step": 2290 + }, + { + "epoch": 0.67, + "learning_rate": 4.770912745176822e-07, + "logits/chosen": -2.7870826721191406, + "logits/rejected": -2.753652572631836, + "logps/chosen": -221.02529907226562, + "logps/rejected": -181.46377563476562, + "loss": 0.9619, + "rewards/accuracies": 0.5218750238418579, + "rewards/chosen": -0.10642800480127335, + "rewards/margins": 0.022537609562277794, + "rewards/rejected": -0.1289656013250351, + "step": 2300 + }, + { + "epoch": 0.67, + "eval_logits/chosen": -2.6579878330230713, + "eval_logits/rejected": -2.6526882648468018, + "eval_logps/chosen": -196.61697387695312, + "eval_logps/rejected": -182.3572235107422, + "eval_loss": 0.9603249430656433, + "eval_rewards/accuracies": 0.5783480405807495, + "eval_rewards/chosen": -0.10116615891456604, + "eval_rewards/margins": 0.04242768511176109, + "eval_rewards/rejected": -0.14359383285045624, + "eval_runtime": 443.5502, + "eval_samples_per_second": 26.525, + "eval_steps_per_second": 3.316, + "step": 2300 + }, + { + "epoch": 0.67, + "learning_rate": 4.7673509248719737e-07, + "logits/chosen": -2.7467334270477295, + "logits/rejected": -2.7274622917175293, + "logps/chosen": -189.3142547607422, + "logps/rejected": -175.53521728515625, + "loss": 0.9534, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.11234588921070099, + "rewards/margins": 0.029966553673148155, + "rewards/rejected": -0.1423124372959137, + "step": 2310 + }, + { + "epoch": 0.68, + "learning_rate": 4.763762979030205e-07, + "logits/chosen": -2.7524502277374268, + "logits/rejected": -2.760484218597412, + "logps/chosen": -206.99856567382812, + "logps/rejected": -195.05409240722656, + "loss": 0.9601, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.08634912222623825, + "rewards/margins": 0.04963558167219162, + "rewards/rejected": -0.13598468899726868, + "step": 2320 + }, + { + "epoch": 0.68, + "learning_rate": 4.760148948993601e-07, + "logits/chosen": -2.780813217163086, + "logits/rejected": -2.7848782539367676, + "logps/chosen": -194.63572692871094, + "logps/rejected": -186.0827178955078, + "loss": 0.9648, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.11056170612573624, + "rewards/margins": 0.04150097444653511, + "rewards/rejected": -0.15206268429756165, + "step": 2330 + }, + { + "epoch": 0.68, + "learning_rate": 4.7565088764047993e-07, + "logits/chosen": -2.740149974822998, + "logits/rejected": -2.730266571044922, + "logps/chosen": -185.96807861328125, + "logps/rejected": -171.44134521484375, + "loss": 0.9529, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.10003244876861572, + "rewards/margins": 0.04513604938983917, + "rewards/rejected": -0.1451684981584549, + "step": 2340 + }, + { + "epoch": 0.69, + "learning_rate": 4.752842803206515e-07, + "logits/chosen": -2.768859624862671, + "logits/rejected": -2.765820264816284, + "logps/chosen": -183.59072875976562, + "logps/rejected": -177.88339233398438, + "loss": 0.9768, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -0.13758380711078644, + "rewards/margins": 0.017323657870292664, + "rewards/rejected": -0.1549074649810791, + "step": 2350 + }, + { + "epoch": 0.69, + "learning_rate": 4.749150771641053e-07, + "logits/chosen": -2.745387315750122, + "logits/rejected": -2.743863582611084, + "logps/chosen": -192.3404083251953, + "logps/rejected": -175.23135375976562, + "loss": 0.9608, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.09886828809976578, + "rewards/margins": 0.03992761671543121, + "rewards/rejected": -0.1387959122657776, + "step": 2360 + }, + { + "epoch": 0.69, + "learning_rate": 4.7454328242498243e-07, + "logits/chosen": -2.7137324810028076, + "logits/rejected": -2.741093158721924, + "logps/chosen": -190.41773986816406, + "logps/rejected": -194.13595581054688, + "loss": 0.9667, + "rewards/accuracies": 0.559374988079071, + "rewards/chosen": -0.12575829029083252, + "rewards/margins": 0.03708643093705177, + "rewards/rejected": -0.1628447026014328, + "step": 2370 + }, + { + "epoch": 0.69, + "learning_rate": 4.7416890038728543e-07, + "logits/chosen": -2.7767727375030518, + "logits/rejected": -2.7499849796295166, + "logps/chosen": -216.6398468017578, + "logps/rejected": -189.55067443847656, + "loss": 0.952, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.10577182471752167, + "rewards/margins": 0.04436280578374863, + "rewards/rejected": -0.1501346379518509, + "step": 2380 + }, + { + "epoch": 0.7, + "learning_rate": 4.73791935364829e-07, + "logits/chosen": -2.7677741050720215, + "logits/rejected": -2.7710769176483154, + "logps/chosen": -196.3310089111328, + "logps/rejected": -189.33282470703125, + "loss": 0.9504, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.11472728103399277, + "rewards/margins": 0.04461818188428879, + "rewards/rejected": -0.15934546291828156, + "step": 2390 + }, + { + "epoch": 0.7, + "learning_rate": 4.734123917011903e-07, + "logits/chosen": -2.7767577171325684, + "logits/rejected": -2.7674083709716797, + "logps/chosen": -196.1704864501953, + "logps/rejected": -187.2528533935547, + "loss": 0.9511, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.12227902561426163, + "rewards/margins": 0.048698049038648605, + "rewards/rejected": -0.17097707092761993, + "step": 2400 + }, + { + "epoch": 0.7, + "eval_logits/chosen": -2.6617283821105957, + "eval_logits/rejected": -2.656481981277466, + "eval_logps/chosen": -196.710693359375, + "eval_logps/rejected": -182.46116638183594, + "eval_loss": 0.9600609540939331, + "eval_rewards/accuracies": 0.57222980260849, + "eval_rewards/chosen": -0.11053957790136337, + "eval_rewards/margins": 0.043447259813547134, + "eval_rewards/rejected": -0.1539868265390396, + "eval_runtime": 443.5718, + "eval_samples_per_second": 26.523, + "eval_steps_per_second": 3.316, + "step": 2400 + }, + { + "epoch": 0.7, + "learning_rate": 4.7303027376965874e-07, + "logits/chosen": -2.746814012527466, + "logits/rejected": -2.731839895248413, + "logps/chosen": -209.3233642578125, + "logps/rejected": -186.90347290039062, + "loss": 0.9485, + "rewards/accuracies": 0.5531250238418579, + "rewards/chosen": -0.10549162328243256, + "rewards/margins": 0.0548577681183815, + "rewards/rejected": -0.16034939885139465, + "step": 2410 + }, + { + "epoch": 0.71, + "learning_rate": 4.726455859731859e-07, + "logits/chosen": -2.7607133388519287, + "logits/rejected": -2.7557640075683594, + "logps/chosen": -180.23956298828125, + "logps/rejected": -178.86196899414062, + "loss": 0.9443, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.11721359193325043, + "rewards/margins": 0.05561182647943497, + "rewards/rejected": -0.172825425863266, + "step": 2420 + }, + { + "epoch": 0.71, + "learning_rate": 4.7225833274433455e-07, + "logits/chosen": -2.7584927082061768, + "logits/rejected": -2.7558186054229736, + "logps/chosen": -189.36634826660156, + "logps/rejected": -185.4894256591797, + "loss": 0.9589, + "rewards/accuracies": 0.6156250238418579, + "rewards/chosen": -0.11889169365167618, + "rewards/margins": 0.04156076908111572, + "rewards/rejected": -0.1604524850845337, + "step": 2430 + }, + { + "epoch": 0.71, + "learning_rate": 4.718685185452275e-07, + "logits/chosen": -2.7491304874420166, + "logits/rejected": -2.7356982231140137, + "logps/chosen": -193.890380859375, + "logps/rejected": -172.22006225585938, + "loss": 0.9558, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.11748484522104263, + "rewards/margins": 0.04917275533080101, + "rewards/rejected": -0.16665759682655334, + "step": 2440 + }, + { + "epoch": 0.71, + "learning_rate": 4.7147614786749656e-07, + "logits/chosen": -2.785374164581299, + "logits/rejected": -2.7864601612091064, + "logps/chosen": -204.22335815429688, + "logps/rejected": -191.0725860595703, + "loss": 0.9432, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.12457302957773209, + "rewards/margins": 0.047984734177589417, + "rewards/rejected": -0.1725577563047409, + "step": 2450 + }, + { + "epoch": 0.72, + "learning_rate": 4.710812252322303e-07, + "logits/chosen": -2.7331464290618896, + "logits/rejected": -2.725174903869629, + "logps/chosen": -196.36265563964844, + "logps/rejected": -177.1366729736328, + "loss": 0.966, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -0.12758466601371765, + "rewards/margins": 0.020772898569703102, + "rewards/rejected": -0.1483575403690338, + "step": 2460 + }, + { + "epoch": 0.72, + "learning_rate": 4.706837551899223e-07, + "logits/chosen": -2.7659404277801514, + "logits/rejected": -2.747095823287964, + "logps/chosen": -216.7389373779297, + "logps/rejected": -189.22409057617188, + "loss": 0.9534, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.12142902612686157, + "rewards/margins": 0.040458451956510544, + "rewards/rejected": -0.16188748180866241, + "step": 2470 + }, + { + "epoch": 0.72, + "learning_rate": 4.7028374232041877e-07, + "logits/chosen": -2.722733736038208, + "logits/rejected": -2.7091987133026123, + "logps/chosen": -174.03915405273438, + "logps/rejected": -176.1728057861328, + "loss": 0.9594, + "rewards/accuracies": 0.6031249761581421, + "rewards/chosen": -0.10743166506290436, + "rewards/margins": 0.04605314880609512, + "rewards/rejected": -0.15348480641841888, + "step": 2480 + }, + { + "epoch": 0.73, + "learning_rate": 4.698811912328655e-07, + "logits/chosen": -2.7615768909454346, + "logits/rejected": -2.772902011871338, + "logps/chosen": -179.90347290039062, + "logps/rejected": -184.03897094726562, + "loss": 0.9568, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.11061780154705048, + "rewards/margins": 0.05398521572351456, + "rewards/rejected": -0.16460299491882324, + "step": 2490 + }, + { + "epoch": 0.73, + "learning_rate": 4.6947610656565485e-07, + "logits/chosen": -2.7560980319976807, + "logits/rejected": -2.746638298034668, + "logps/chosen": -207.52249145507812, + "logps/rejected": -189.99624633789062, + "loss": 0.9516, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.11252021789550781, + "rewards/margins": 0.053376246243715286, + "rewards/rejected": -0.1658964604139328, + "step": 2500 + }, + { + "epoch": 0.73, + "eval_logits/chosen": -2.6664488315582275, + "eval_logits/rejected": -2.6613011360168457, + "eval_logps/chosen": -196.7630157470703, + "eval_logps/rejected": -182.53892517089844, + "eval_loss": 0.957039475440979, + "eval_rewards/accuracies": 0.5715499520301819, + "eval_rewards/chosen": -0.1157722994685173, + "eval_rewards/margins": 0.04599176347255707, + "eval_rewards/rejected": -0.16176405549049377, + "eval_runtime": 443.4481, + "eval_samples_per_second": 26.531, + "eval_steps_per_second": 3.317, + "step": 2500 + }, + { + "epoch": 0.73, + "learning_rate": 4.690684929863723e-07, + "logits/chosen": -2.7579269409179688, + "logits/rejected": -2.763580083847046, + "logps/chosen": -193.8256378173828, + "logps/rejected": -179.65748596191406, + "loss": 0.967, + "rewards/accuracies": 0.49687498807907104, + "rewards/chosen": -0.1388111412525177, + "rewards/margins": 0.021634388715028763, + "rewards/rejected": -0.16044552624225616, + "step": 2510 + }, + { + "epoch": 0.74, + "learning_rate": 4.68658355191743e-07, + "logits/chosen": -2.7525057792663574, + "logits/rejected": -2.733154535293579, + "logps/chosen": -215.12722778320312, + "logps/rejected": -178.46194458007812, + "loss": 0.9572, + "rewards/accuracies": 0.5718749761581421, + "rewards/chosen": -0.12127050012350082, + "rewards/margins": 0.050067853182554245, + "rewards/rejected": -0.17133836448192596, + "step": 2520 + }, + { + "epoch": 0.74, + "learning_rate": 4.6824569790757683e-07, + "logits/chosen": -2.758488416671753, + "logits/rejected": -2.771643877029419, + "logps/chosen": -207.5240020751953, + "logps/rejected": -196.0047149658203, + "loss": 0.96, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.13123732805252075, + "rewards/margins": 0.05722743272781372, + "rewards/rejected": -0.18846476078033447, + "step": 2530 + }, + { + "epoch": 0.74, + "learning_rate": 4.678305258887151e-07, + "logits/chosen": -2.734318256378174, + "logits/rejected": -2.7332804203033447, + "logps/chosen": -198.74063110351562, + "logps/rejected": -180.7388458251953, + "loss": 0.9637, + "rewards/accuracies": 0.559374988079071, + "rewards/chosen": -0.1199951022863388, + "rewards/margins": 0.0437748059630394, + "rewards/rejected": -0.1637698858976364, + "step": 2540 + }, + { + "epoch": 0.74, + "learning_rate": 4.674128439189745e-07, + "logits/chosen": -2.738473415374756, + "logits/rejected": -2.7565932273864746, + "logps/chosen": -199.6444549560547, + "logps/rejected": -204.21682739257812, + "loss": 0.9474, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.12135820090770721, + "rewards/margins": 0.0532490611076355, + "rewards/rejected": -0.1746072620153427, + "step": 2550 + }, + { + "epoch": 0.75, + "learning_rate": 4.669926568110932e-07, + "logits/chosen": -2.737910747528076, + "logits/rejected": -2.720459461212158, + "logps/chosen": -183.45167541503906, + "logps/rejected": -174.4886474609375, + "loss": 0.9577, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.12640735507011414, + "rewards/margins": 0.04174577444791794, + "rewards/rejected": -0.16815313696861267, + "step": 2560 + }, + { + "epoch": 0.75, + "learning_rate": 4.6656996940667436e-07, + "logits/chosen": -2.728344440460205, + "logits/rejected": -2.7403762340545654, + "logps/chosen": -180.70962524414062, + "logps/rejected": -175.8680419921875, + "loss": 0.95, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.11759184300899506, + "rewards/margins": 0.053888868540525436, + "rewards/rejected": -0.17148073017597198, + "step": 2570 + }, + { + "epoch": 0.75, + "learning_rate": 4.661447865761311e-07, + "logits/chosen": -2.731985569000244, + "logits/rejected": -2.7103681564331055, + "logps/chosen": -197.1103057861328, + "logps/rejected": -169.7650909423828, + "loss": 0.9609, + "rewards/accuracies": 0.528124988079071, + "rewards/chosen": -0.13190656900405884, + "rewards/margins": 0.030604299157857895, + "rewards/rejected": -0.16251085698604584, + "step": 2580 + }, + { + "epoch": 0.76, + "learning_rate": 4.6571711321862977e-07, + "logits/chosen": -2.730292558670044, + "logits/rejected": -2.730025053024292, + "logps/chosen": -169.65896606445312, + "logps/rejected": -171.60116577148438, + "loss": 0.9588, + "rewards/accuracies": 0.5093749761581421, + "rewards/chosen": -0.13009211421012878, + "rewards/margins": 0.03937167674303055, + "rewards/rejected": -0.16946378350257874, + "step": 2590 + }, + { + "epoch": 0.76, + "learning_rate": 4.652869542620341e-07, + "logits/chosen": -2.709564447402954, + "logits/rejected": -2.7069900035858154, + "logps/chosen": -187.7020721435547, + "logps/rejected": -160.48951721191406, + "loss": 0.9577, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.10107941925525665, + "rewards/margins": 0.047139011323451996, + "rewards/rejected": -0.14821843802928925, + "step": 2600 + }, + { + "epoch": 0.76, + "eval_logits/chosen": -2.664644718170166, + "eval_logits/rejected": -2.6595005989074707, + "eval_logps/chosen": -196.8412628173828, + "eval_logps/rejected": -182.63865661621094, + "eval_loss": 0.9554187655448914, + "eval_rewards/accuracies": 0.5718898773193359, + "eval_rewards/chosen": -0.12359517812728882, + "eval_rewards/margins": 0.0481419675052166, + "eval_rewards/rejected": -0.1717371642589569, + "eval_runtime": 443.3935, + "eval_samples_per_second": 26.534, + "eval_steps_per_second": 3.318, + "step": 2600 + }, + { + "epoch": 0.76, + "learning_rate": 4.64854314662848e-07, + "logits/chosen": -2.7113401889801025, + "logits/rejected": -2.726046323776245, + "logps/chosen": -177.04306030273438, + "logps/rejected": -169.5409698486328, + "loss": 0.9594, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.13913336396217346, + "rewards/margins": 0.03179800882935524, + "rewards/rejected": -0.1709313690662384, + "step": 2610 + }, + { + "epoch": 0.76, + "learning_rate": 4.644191994061584e-07, + "logits/chosen": -2.7433714866638184, + "logits/rejected": -2.7490386962890625, + "logps/chosen": -198.26492309570312, + "logps/rejected": -195.8823699951172, + "loss": 0.9551, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.12107695639133453, + "rewards/margins": 0.05229382589459419, + "rewards/rejected": -0.17337077856063843, + "step": 2620 + }, + { + "epoch": 0.77, + "learning_rate": 4.639816135055783e-07, + "logits/chosen": -2.7246880531311035, + "logits/rejected": -2.7300329208374023, + "logps/chosen": -187.84384155273438, + "logps/rejected": -170.01132202148438, + "loss": 0.9486, + "rewards/accuracies": 0.5093749761581421, + "rewards/chosen": -0.11736071109771729, + "rewards/margins": 0.04356401413679123, + "rewards/rejected": -0.16092471778392792, + "step": 2630 + }, + { + "epoch": 0.77, + "learning_rate": 4.635415620031885e-07, + "logits/chosen": -2.7280688285827637, + "logits/rejected": -2.7262918949127197, + "logps/chosen": -199.6859893798828, + "logps/rejected": -175.3978271484375, + "loss": 0.963, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.1443082094192505, + "rewards/margins": 0.03426407650113106, + "rewards/rejected": -0.17857226729393005, + "step": 2640 + }, + { + "epoch": 0.77, + "learning_rate": 4.6309904996947966e-07, + "logits/chosen": -2.7617874145507812, + "logits/rejected": -2.779418706893921, + "logps/chosen": -204.58631896972656, + "logps/rejected": -197.4747314453125, + "loss": 0.95, + "rewards/accuracies": 0.528124988079071, + "rewards/chosen": -0.12481949478387833, + "rewards/margins": 0.058627624064683914, + "rewards/rejected": -0.18344712257385254, + "step": 2650 + }, + { + "epoch": 0.78, + "learning_rate": 4.626540825032939e-07, + "logits/chosen": -2.779951572418213, + "logits/rejected": -2.7412378787994385, + "logps/chosen": -213.5417022705078, + "logps/rejected": -175.49853515625, + "loss": 0.961, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.14209267497062683, + "rewards/margins": 0.045297764241695404, + "rewards/rejected": -0.18739044666290283, + "step": 2660 + }, + { + "epoch": 0.78, + "learning_rate": 4.622066647317662e-07, + "logits/chosen": -2.7636003494262695, + "logits/rejected": -2.745131731033325, + "logps/chosen": -195.9354248046875, + "logps/rejected": -181.21420288085938, + "loss": 0.962, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.1278540939092636, + "rewards/margins": 0.03553072363138199, + "rewards/rejected": -0.163384810090065, + "step": 2670 + }, + { + "epoch": 0.78, + "learning_rate": 4.617568018102649e-07, + "logits/chosen": -2.8005611896514893, + "logits/rejected": -2.7828357219696045, + "logps/chosen": -214.6331329345703, + "logps/rejected": -191.2391357421875, + "loss": 0.9445, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.13087083399295807, + "rewards/margins": 0.05404329299926758, + "rewards/rejected": -0.18491411209106445, + "step": 2680 + }, + { + "epoch": 0.78, + "learning_rate": 4.613044989223327e-07, + "logits/chosen": -2.764817953109741, + "logits/rejected": -2.769461154937744, + "logps/chosen": -198.07208251953125, + "logps/rejected": -188.2695770263672, + "loss": 0.9588, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.12125615030527115, + "rewards/margins": 0.041098788380622864, + "rewards/rejected": -0.1623549461364746, + "step": 2690 + }, + { + "epoch": 0.79, + "learning_rate": 4.6084976127962694e-07, + "logits/chosen": -2.7548482418060303, + "logits/rejected": -2.7405712604522705, + "logps/chosen": -190.39096069335938, + "logps/rejected": -180.38926696777344, + "loss": 0.9471, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.12370000034570694, + "rewards/margins": 0.05232849717140198, + "rewards/rejected": -0.17602849006652832, + "step": 2700 + }, + { + "epoch": 0.79, + "eval_logits/chosen": -2.6672322750091553, + "eval_logits/rejected": -2.6621241569519043, + "eval_logps/chosen": -196.8730926513672, + "eval_logps/rejected": -182.68402099609375, + "eval_loss": 0.9541336297988892, + "eval_rewards/accuracies": 0.5735893845558167, + "eval_rewards/chosen": -0.1267787367105484, + "eval_rewards/margins": 0.04949454218149185, + "eval_rewards/rejected": -0.17627327144145966, + "eval_runtime": 443.3932, + "eval_samples_per_second": 26.534, + "eval_steps_per_second": 3.318, + "step": 2700 + }, + { + "epoch": 0.79, + "learning_rate": 4.603925941218593e-07, + "logits/chosen": -2.731152296066284, + "logits/rejected": -2.7341597080230713, + "logps/chosen": -212.43447875976562, + "logps/rejected": -197.4214630126953, + "loss": 0.9455, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.1260325014591217, + "rewards/margins": 0.06015104055404663, + "rewards/rejected": -0.18618355691432953, + "step": 2710 + }, + { + "epoch": 0.79, + "learning_rate": 4.5993300271673535e-07, + "logits/chosen": -2.74223256111145, + "logits/rejected": -2.7303566932678223, + "logps/chosen": -188.73492431640625, + "logps/rejected": -185.53005981445312, + "loss": 0.9645, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.15222349762916565, + "rewards/margins": 0.033357687294483185, + "rewards/rejected": -0.18558119237422943, + "step": 2720 + }, + { + "epoch": 0.8, + "learning_rate": 4.5947099235989426e-07, + "logits/chosen": -2.741570472717285, + "logits/rejected": -2.756967067718506, + "logps/chosen": -202.7544403076172, + "logps/rejected": -191.4830322265625, + "loss": 0.9544, + "rewards/accuracies": 0.590624988079071, + "rewards/chosen": -0.1407163292169571, + "rewards/margins": 0.0526912696659565, + "rewards/rejected": -0.1934075951576233, + "step": 2730 + }, + { + "epoch": 0.8, + "learning_rate": 4.590065683748476e-07, + "logits/chosen": -2.761409044265747, + "logits/rejected": -2.759230852127075, + "logps/chosen": -190.07643127441406, + "logps/rejected": -184.67315673828125, + "loss": 0.9524, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.13507774472236633, + "rewards/margins": 0.043145522475242615, + "rewards/rejected": -0.17822325229644775, + "step": 2740 + }, + { + "epoch": 0.8, + "learning_rate": 4.585397361129177e-07, + "logits/chosen": -2.7489280700683594, + "logits/rejected": -2.730376720428467, + "logps/chosen": -191.56483459472656, + "logps/rejected": -171.06887817382812, + "loss": 0.9452, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.13877546787261963, + "rewards/margins": 0.05284839868545532, + "rewards/rejected": -0.19162388145923615, + "step": 2750 + }, + { + "epoch": 0.81, + "learning_rate": 4.5807050095317643e-07, + "logits/chosen": -2.7506136894226074, + "logits/rejected": -2.760101795196533, + "logps/chosen": -185.88739013671875, + "logps/rejected": -184.04281616210938, + "loss": 0.9573, + "rewards/accuracies": 0.534375011920929, + "rewards/chosen": -0.15197591483592987, + "rewards/margins": 0.04332758113741875, + "rewards/rejected": -0.1953035145998001, + "step": 2760 + }, + { + "epoch": 0.81, + "learning_rate": 4.575988683023831e-07, + "logits/chosen": -2.750823736190796, + "logits/rejected": -2.737844467163086, + "logps/chosen": -186.29476928710938, + "logps/rejected": -161.99533081054688, + "loss": 0.9633, + "rewards/accuracies": 0.5531250238418579, + "rewards/chosen": -0.1368028223514557, + "rewards/margins": 0.04006402567028999, + "rewards/rejected": -0.17686684429645538, + "step": 2770 + }, + { + "epoch": 0.81, + "learning_rate": 4.5712484359492185e-07, + "logits/chosen": -2.755030870437622, + "logits/rejected": -2.7267005443573, + "logps/chosen": -188.34628295898438, + "logps/rejected": -176.9173583984375, + "loss": 0.9636, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.14116336405277252, + "rewards/margins": 0.041894249618053436, + "rewards/rejected": -0.18305759131908417, + "step": 2780 + }, + { + "epoch": 0.81, + "learning_rate": 4.5664843229273954e-07, + "logits/chosen": -2.727405309677124, + "logits/rejected": -2.7636685371398926, + "logps/chosen": -185.30831909179688, + "logps/rejected": -189.11410522460938, + "loss": 0.9475, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.11881668865680695, + "rewards/margins": 0.062104128301143646, + "rewards/rejected": -0.18092080950737, + "step": 2790 + }, + { + "epoch": 0.82, + "learning_rate": 4.561696398852823e-07, + "logits/chosen": -2.721590995788574, + "logits/rejected": -2.7140235900878906, + "logps/chosen": -187.95919799804688, + "logps/rejected": -173.23416137695312, + "loss": 0.9519, + "rewards/accuracies": 0.546875, + "rewards/chosen": -0.13266701996326447, + "rewards/margins": 0.05463450402021408, + "rewards/rejected": -0.18730153143405914, + "step": 2800 + }, + { + "epoch": 0.82, + "eval_logits/chosen": -2.681025981903076, + "eval_logits/rejected": -2.6762020587921143, + "eval_logps/chosen": -196.94139099121094, + "eval_logps/rejected": -182.77047729492188, + "eval_loss": 0.9523664712905884, + "eval_rewards/accuracies": 0.5737593770027161, + "eval_rewards/chosen": -0.13360761106014252, + "eval_rewards/margins": 0.051313430070877075, + "eval_rewards/rejected": -0.1849210411310196, + "eval_runtime": 443.4513, + "eval_samples_per_second": 26.531, + "eval_steps_per_second": 3.317, + "step": 2800 + }, + { + "epoch": 0.82, + "learning_rate": 4.556884718894327e-07, + "logits/chosen": -2.7339940071105957, + "logits/rejected": -2.716157913208008, + "logps/chosen": -201.26852416992188, + "logps/rejected": -188.23138427734375, + "loss": 0.9512, + "rewards/accuracies": 0.5218750238418579, + "rewards/chosen": -0.14071759581565857, + "rewards/margins": 0.039004649966955185, + "rewards/rejected": -0.17972226440906525, + "step": 2810 + }, + { + "epoch": 0.82, + "learning_rate": 4.5520493384944614e-07, + "logits/chosen": -2.7608866691589355, + "logits/rejected": -2.7515382766723633, + "logps/chosen": -208.635009765625, + "logps/rejected": -188.59561157226562, + "loss": 0.9467, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.12841913104057312, + "rewards/margins": 0.06464473158121109, + "rewards/rejected": -0.193063884973526, + "step": 2820 + }, + { + "epoch": 0.83, + "learning_rate": 4.547190313368865e-07, + "logits/chosen": -2.750911235809326, + "logits/rejected": -2.7666754722595215, + "logps/chosen": -187.2324981689453, + "logps/rejected": -188.1076202392578, + "loss": 0.9577, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.13645856082439423, + "rewards/margins": 0.05990775674581528, + "rewards/rejected": -0.1963663399219513, + "step": 2830 + }, + { + "epoch": 0.83, + "learning_rate": 4.5423076995056255e-07, + "logits/chosen": -2.72402024269104, + "logits/rejected": -2.7575836181640625, + "logps/chosen": -180.29605102539062, + "logps/rejected": -186.4771728515625, + "loss": 0.9581, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.1337813436985016, + "rewards/margins": 0.03869408741593361, + "rewards/rejected": -0.1724754273891449, + "step": 2840 + }, + { + "epoch": 0.83, + "learning_rate": 4.537401553164629e-07, + "logits/chosen": -2.7499241828918457, + "logits/rejected": -2.77961802482605, + "logps/chosen": -180.1635284423828, + "logps/rejected": -189.73565673828125, + "loss": 0.9433, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.14217296242713928, + "rewards/margins": 0.06596062332391739, + "rewards/rejected": -0.20813357830047607, + "step": 2850 + }, + { + "epoch": 0.83, + "learning_rate": 4.532471930876919e-07, + "logits/chosen": -2.756718158721924, + "logits/rejected": -2.7450459003448486, + "logps/chosen": -214.15200805664062, + "logps/rejected": -197.15908813476562, + "loss": 0.9513, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1372036188840866, + "rewards/margins": 0.05283326655626297, + "rewards/rejected": -0.19003687798976898, + "step": 2860 + }, + { + "epoch": 0.84, + "learning_rate": 4.527518889444035e-07, + "logits/chosen": -2.746441602706909, + "logits/rejected": -2.756840229034424, + "logps/chosen": -189.22317504882812, + "logps/rejected": -187.7545166015625, + "loss": 0.9476, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.13749714195728302, + "rewards/margins": 0.06224555894732475, + "rewards/rejected": -0.19974270462989807, + "step": 2870 + }, + { + "epoch": 0.84, + "learning_rate": 4.5225424859373684e-07, + "logits/chosen": -2.7592597007751465, + "logits/rejected": -2.7454373836517334, + "logps/chosen": -192.59786987304688, + "logps/rejected": -177.7920684814453, + "loss": 0.9703, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.14666634798049927, + "rewards/margins": 0.03436069190502167, + "rewards/rejected": -0.18102705478668213, + "step": 2880 + }, + { + "epoch": 0.84, + "learning_rate": 4.517542777697496e-07, + "logits/chosen": -2.7579264640808105, + "logits/rejected": -2.717677593231201, + "logps/chosen": -205.26953125, + "logps/rejected": -170.2499237060547, + "loss": 0.9531, + "rewards/accuracies": 0.559374988079071, + "rewards/chosen": -0.12789583206176758, + "rewards/margins": 0.052336446940898895, + "rewards/rejected": -0.18023227155208588, + "step": 2890 + }, + { + "epoch": 0.85, + "learning_rate": 4.512519822333525e-07, + "logits/chosen": -2.740565299987793, + "logits/rejected": -2.716698169708252, + "logps/chosen": -222.6794891357422, + "logps/rejected": -183.50570678710938, + "loss": 0.9522, + "rewards/accuracies": 0.515625, + "rewards/chosen": -0.14425984025001526, + "rewards/margins": 0.031176995486021042, + "rewards/rejected": -0.1754368394613266, + "step": 2900 + }, + { + "epoch": 0.85, + "eval_logits/chosen": -2.6654722690582275, + "eval_logits/rejected": -2.660351037979126, + "eval_logps/chosen": -196.96963500976562, + "eval_logps/rejected": -182.81700134277344, + "eval_loss": 0.9514912962913513, + "eval_rewards/accuracies": 0.5723997354507446, + "eval_rewards/chosen": -0.13643421232700348, + "eval_rewards/margins": 0.05313733592629433, + "eval_rewards/rejected": -0.1895715445280075, + "eval_runtime": 443.3239, + "eval_samples_per_second": 26.538, + "eval_steps_per_second": 3.318, + "step": 2900 + }, + { + "epoch": 0.85, + "learning_rate": 4.507473677722428e-07, + "logits/chosen": -2.754894733428955, + "logits/rejected": -2.723676919937134, + "logps/chosen": -225.0426025390625, + "logps/rejected": -184.7257843017578, + "loss": 0.9566, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.13445214927196503, + "rewards/margins": 0.03854208067059517, + "rewards/rejected": -0.1729942411184311, + "step": 2910 + }, + { + "epoch": 0.85, + "learning_rate": 4.502404402008374e-07, + "logits/chosen": -2.7293784618377686, + "logits/rejected": -2.7325520515441895, + "logps/chosen": -184.0533905029297, + "logps/rejected": -168.29550170898438, + "loss": 0.9458, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.12156828492879868, + "rewards/margins": 0.05183269828557968, + "rewards/rejected": -0.17340096831321716, + "step": 2920 + }, + { + "epoch": 0.85, + "learning_rate": 4.4973120536020623e-07, + "logits/chosen": -2.7345995903015137, + "logits/rejected": -2.755232572555542, + "logps/chosen": -181.9169158935547, + "logps/rejected": -173.37290954589844, + "loss": 0.9411, + "rewards/accuracies": 0.559374988079071, + "rewards/chosen": -0.12848785519599915, + "rewards/margins": 0.05240977555513382, + "rewards/rejected": -0.18089762330055237, + "step": 2930 + }, + { + "epoch": 0.86, + "learning_rate": 4.4921966911800446e-07, + "logits/chosen": -2.768277645111084, + "logits/rejected": -2.783160924911499, + "logps/chosen": -200.125, + "logps/rejected": -195.50100708007812, + "loss": 0.9381, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.14732368290424347, + "rewards/margins": 0.04739189147949219, + "rewards/rejected": -0.19471557438373566, + "step": 2940 + }, + { + "epoch": 0.86, + "learning_rate": 4.487058373684052e-07, + "logits/chosen": -2.760740280151367, + "logits/rejected": -2.747847557067871, + "logps/chosen": -190.6141815185547, + "logps/rejected": -180.71969604492188, + "loss": 0.936, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.13626626133918762, + "rewards/margins": 0.05940115451812744, + "rewards/rejected": -0.19566740095615387, + "step": 2950 + }, + { + "epoch": 0.86, + "learning_rate": 4.4818971603203174e-07, + "logits/chosen": -2.764630079269409, + "logits/rejected": -2.7455592155456543, + "logps/chosen": -207.49392700195312, + "logps/rejected": -179.84259033203125, + "loss": 0.9377, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.14047829806804657, + "rewards/margins": 0.05755741521716118, + "rewards/rejected": -0.19803571701049805, + "step": 2960 + }, + { + "epoch": 0.87, + "learning_rate": 4.4767131105588885e-07, + "logits/chosen": -2.747560739517212, + "logits/rejected": -2.7539334297180176, + "logps/chosen": -187.84457397460938, + "logps/rejected": -182.74752807617188, + "loss": 0.9376, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.14399246871471405, + "rewards/margins": 0.07578577101230621, + "rewards/rejected": -0.21977825462818146, + "step": 2970 + }, + { + "epoch": 0.87, + "learning_rate": 4.471506284132948e-07, + "logits/chosen": -2.7605197429656982, + "logits/rejected": -2.7485010623931885, + "logps/chosen": -203.68710327148438, + "logps/rejected": -191.30636596679688, + "loss": 0.9585, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.15663865208625793, + "rewards/margins": 0.03506855294108391, + "rewards/rejected": -0.19170720875263214, + "step": 2980 + }, + { + "epoch": 0.87, + "learning_rate": 4.466276741038118e-07, + "logits/chosen": -2.731755256652832, + "logits/rejected": -2.7256150245666504, + "logps/chosen": -183.74453735351562, + "logps/rejected": -173.6746368408203, + "loss": 0.9604, + "rewards/accuracies": 0.515625, + "rewards/chosen": -0.16540805995464325, + "rewards/margins": 0.03276645019650459, + "rewards/rejected": -0.19817450642585754, + "step": 2990 + }, + { + "epoch": 0.88, + "learning_rate": 4.461024541531779e-07, + "logits/chosen": -2.7541847229003906, + "logits/rejected": -2.7689316272735596, + "logps/chosen": -212.4781951904297, + "logps/rejected": -209.14669799804688, + "loss": 0.9414, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.1305633783340454, + "rewards/margins": 0.06333880126476288, + "rewards/rejected": -0.19390219449996948, + "step": 3000 + }, + { + "epoch": 0.88, + "eval_logits/chosen": -2.675511360168457, + "eval_logits/rejected": -2.67060923576355, + "eval_logps/chosen": -197.00001525878906, + "eval_logps/rejected": -182.87057495117188, + "eval_loss": 0.9491233229637146, + "eval_rewards/accuracies": 0.5744391679763794, + "eval_rewards/chosen": -0.13946956396102905, + "eval_rewards/margins": 0.05546097829937935, + "eval_rewards/rejected": -0.1949305385351181, + "eval_runtime": 443.5187, + "eval_samples_per_second": 26.527, + "eval_steps_per_second": 3.317, + "step": 3000 + }, + { + "epoch": 0.88, + "learning_rate": 4.455749746132366e-07, + "logits/chosen": -2.7444159984588623, + "logits/rejected": -2.7448792457580566, + "logps/chosen": -190.2972412109375, + "logps/rejected": -169.44912719726562, + "loss": 0.9336, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.14413055777549744, + "rewards/margins": 0.049498431384563446, + "rewards/rejected": -0.19362899661064148, + "step": 3010 + }, + { + "epoch": 0.88, + "learning_rate": 4.4504524156186763e-07, + "logits/chosen": -2.7625129222869873, + "logits/rejected": -2.753511905670166, + "logps/chosen": -179.7624053955078, + "logps/rejected": -167.31228637695312, + "loss": 0.9557, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.13771577179431915, + "rewards/margins": 0.04483487457036972, + "rewards/rejected": -0.18255063891410828, + "step": 3020 + }, + { + "epoch": 0.88, + "learning_rate": 4.4451326110291675e-07, + "logits/chosen": -2.7575528621673584, + "logits/rejected": -2.7579586505889893, + "logps/chosen": -196.14010620117188, + "logps/rejected": -191.85595703125, + "loss": 0.942, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.15023425221443176, + "rewards/margins": 0.06083030626177788, + "rewards/rejected": -0.21106454730033875, + "step": 3030 + }, + { + "epoch": 0.89, + "learning_rate": 4.439790393661256e-07, + "logits/chosen": -2.7624666690826416, + "logits/rejected": -2.76863956451416, + "logps/chosen": -193.34310913085938, + "logps/rejected": -193.1276092529297, + "loss": 0.9555, + "rewards/accuracies": 0.5531250238418579, + "rewards/chosen": -0.15546974539756775, + "rewards/margins": 0.06791901588439941, + "rewards/rejected": -0.22338874638080597, + "step": 3040 + }, + { + "epoch": 0.89, + "learning_rate": 4.4344258250706066e-07, + "logits/chosen": -2.7380971908569336, + "logits/rejected": -2.7569963932037354, + "logps/chosen": -181.44793701171875, + "logps/rejected": -173.0232391357422, + "loss": 0.9425, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.14029046893119812, + "rewards/margins": 0.06983927637338638, + "rewards/rejected": -0.2101297378540039, + "step": 3050 + }, + { + "epoch": 0.89, + "learning_rate": 4.429038967070429e-07, + "logits/chosen": -2.748533010482788, + "logits/rejected": -2.738431215286255, + "logps/chosen": -197.2476348876953, + "logps/rejected": -192.76535034179688, + "loss": 0.9452, + "rewards/accuracies": 0.546875, + "rewards/chosen": -0.15091857314109802, + "rewards/margins": 0.05640494078397751, + "rewards/rejected": -0.20732350647449493, + "step": 3060 + }, + { + "epoch": 0.9, + "learning_rate": 4.423629881730759e-07, + "logits/chosen": -2.7502188682556152, + "logits/rejected": -2.7514498233795166, + "logps/chosen": -206.1704864501953, + "logps/rejected": -199.50942993164062, + "loss": 0.9488, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.16290298104286194, + "rewards/margins": 0.060902971774339676, + "rewards/rejected": -0.22380594909191132, + "step": 3070 + }, + { + "epoch": 0.9, + "learning_rate": 4.418198631377751e-07, + "logits/chosen": -2.752500295639038, + "logits/rejected": -2.7435920238494873, + "logps/chosen": -190.84780883789062, + "logps/rejected": -180.72679138183594, + "loss": 0.9516, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.13359172642230988, + "rewards/margins": 0.06508227437734604, + "rewards/rejected": -0.19867399334907532, + "step": 3080 + }, + { + "epoch": 0.9, + "learning_rate": 4.412745278592952e-07, + "logits/chosen": -2.7263596057891846, + "logits/rejected": -2.739581346511841, + "logps/chosen": -199.06126403808594, + "logps/rejected": -200.32713317871094, + "loss": 0.9414, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.1425991952419281, + "rewards/margins": 0.06533292680978775, + "rewards/rejected": -0.20793208479881287, + "step": 3090 + }, + { + "epoch": 0.9, + "learning_rate": 4.407269886212586e-07, + "logits/chosen": -2.72841215133667, + "logits/rejected": -2.7648680210113525, + "logps/chosen": -184.0097198486328, + "logps/rejected": -183.879150390625, + "loss": 0.9509, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.1576574146747589, + "rewards/margins": 0.05096912384033203, + "rewards/rejected": -0.20862650871276855, + "step": 3100 + }, + { + "epoch": 0.9, + "eval_logits/chosen": -2.6625335216522217, + "eval_logits/rejected": -2.657381534576416, + "eval_logps/chosen": -197.05508422851562, + "eval_logps/rejected": -182.9411163330078, + "eval_loss": 0.9482882618904114, + "eval_rewards/accuracies": 0.5798776149749756, + "eval_rewards/chosen": -0.14497847855091095, + "eval_rewards/margins": 0.057004500180482864, + "eval_rewards/rejected": -0.2019829899072647, + "eval_runtime": 443.4978, + "eval_samples_per_second": 26.528, + "eval_steps_per_second": 3.317, + "step": 3100 + }, + { + "epoch": 0.91, + "learning_rate": 4.401772517326827e-07, + "logits/chosen": -2.743058204650879, + "logits/rejected": -2.736036777496338, + "logps/chosen": -200.5432586669922, + "logps/rejected": -182.67279052734375, + "loss": 0.9512, + "rewards/accuracies": 0.5406249761581421, + "rewards/chosen": -0.16279372572898865, + "rewards/margins": 0.03614386171102524, + "rewards/rejected": -0.1989375799894333, + "step": 3110 + }, + { + "epoch": 0.91, + "learning_rate": 4.3962532352790746e-07, + "logits/chosen": -2.7184717655181885, + "logits/rejected": -2.7647883892059326, + "logps/chosen": -194.09921264648438, + "logps/rejected": -216.4038848876953, + "loss": 0.9343, + "rewards/accuracies": 0.6156250238418579, + "rewards/chosen": -0.1559465378522873, + "rewards/margins": 0.08612390607595444, + "rewards/rejected": -0.24207043647766113, + "step": 3120 + }, + { + "epoch": 0.91, + "learning_rate": 4.390712103665222e-07, + "logits/chosen": -2.726545810699463, + "logits/rejected": -2.714825391769409, + "logps/chosen": -193.67857360839844, + "logps/rejected": -175.10110473632812, + "loss": 0.9599, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.15397819876670837, + "rewards/margins": 0.04705255106091499, + "rewards/rejected": -0.20103076100349426, + "step": 3130 + }, + { + "epoch": 0.92, + "learning_rate": 4.385149186332923e-07, + "logits/chosen": -2.766066551208496, + "logits/rejected": -2.7510006427764893, + "logps/chosen": -197.1205596923828, + "logps/rejected": -180.7372589111328, + "loss": 0.942, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.15654006600379944, + "rewards/margins": 0.056927651166915894, + "rewards/rejected": -0.21346771717071533, + "step": 3140 + }, + { + "epoch": 0.92, + "learning_rate": 4.379564547380858e-07, + "logits/chosen": -2.735912799835205, + "logits/rejected": -2.7432703971862793, + "logps/chosen": -193.31373596191406, + "logps/rejected": -179.3289337158203, + "loss": 0.9546, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.14612646400928497, + "rewards/margins": 0.04988929629325867, + "rewards/rejected": -0.19601577520370483, + "step": 3150 + }, + { + "epoch": 0.92, + "learning_rate": 4.373958251157995e-07, + "logits/chosen": -2.728239059448242, + "logits/rejected": -2.742741346359253, + "logps/chosen": -182.5937957763672, + "logps/rejected": -183.79812622070312, + "loss": 0.9485, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.16978135704994202, + "rewards/margins": 0.04071738198399544, + "rewards/rejected": -0.21049876511096954, + "step": 3160 + }, + { + "epoch": 0.92, + "learning_rate": 4.3683303622628467e-07, + "logits/chosen": -2.7599949836730957, + "logits/rejected": -2.7678418159484863, + "logps/chosen": -205.2645721435547, + "logps/rejected": -194.56536865234375, + "loss": 0.9322, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.14348538219928741, + "rewards/margins": 0.09094350785017014, + "rewards/rejected": -0.23442888259887695, + "step": 3170 + }, + { + "epoch": 0.93, + "learning_rate": 4.3626809455427284e-07, + "logits/chosen": -2.7088184356689453, + "logits/rejected": -2.737748622894287, + "logps/chosen": -182.29830932617188, + "logps/rejected": -185.71241760253906, + "loss": 0.9458, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1563442051410675, + "rewards/margins": 0.06264548003673553, + "rewards/rejected": -0.21898965537548065, + "step": 3180 + }, + { + "epoch": 0.93, + "learning_rate": 4.357010066093009e-07, + "logits/chosen": -2.7586493492126465, + "logits/rejected": -2.738326072692871, + "logps/chosen": -212.00320434570312, + "logps/rejected": -179.95960998535156, + "loss": 0.9345, + "rewards/accuracies": 0.596875011920929, + "rewards/chosen": -0.13312071561813354, + "rewards/margins": 0.07820292562246323, + "rewards/rejected": -0.21132364869117737, + "step": 3190 + }, + { + "epoch": 0.93, + "learning_rate": 4.351317789256361e-07, + "logits/chosen": -2.725133180618286, + "logits/rejected": -2.711580753326416, + "logps/chosen": -213.65029907226562, + "logps/rejected": -186.83413696289062, + "loss": 0.9453, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.15071360766887665, + "rewards/margins": 0.07792054116725922, + "rewards/rejected": -0.22863414883613586, + "step": 3200 + }, + { + "epoch": 0.93, + "eval_logits/chosen": -2.647813558578491, + "eval_logits/rejected": -2.6423654556274414, + "eval_logps/chosen": -197.0772247314453, + "eval_logps/rejected": -182.98223876953125, + "eval_loss": 0.9472061395645142, + "eval_rewards/accuracies": 0.5834466218948364, + "eval_rewards/chosen": -0.14719060063362122, + "eval_rewards/margins": 0.05890314280986786, + "eval_rewards/rejected": -0.20609375834465027, + "eval_runtime": 443.5189, + "eval_samples_per_second": 26.526, + "eval_steps_per_second": 3.317, + "step": 3200 + }, + { + "epoch": 0.94, + "learning_rate": 4.3456041806220105e-07, + "logits/chosen": -2.710390090942383, + "logits/rejected": -2.732726573944092, + "logps/chosen": -174.5506134033203, + "logps/rejected": -184.31295776367188, + "loss": 0.9507, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.14880771934986115, + "rewards/margins": 0.04055916517972946, + "rewards/rejected": -0.18936687707901, + "step": 3210 + }, + { + "epoch": 0.94, + "learning_rate": 4.3398693060249757e-07, + "logits/chosen": -2.7572438716888428, + "logits/rejected": -2.780435562133789, + "logps/chosen": -205.3828582763672, + "logps/rejected": -194.72537231445312, + "loss": 0.9523, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.159606471657753, + "rewards/margins": 0.03793240338563919, + "rewards/rejected": -0.19753886759281158, + "step": 3220 + }, + { + "epoch": 0.94, + "learning_rate": 4.334113231545314e-07, + "logits/chosen": -2.7653536796569824, + "logits/rejected": -2.7734832763671875, + "logps/chosen": -209.9667205810547, + "logps/rejected": -196.0996856689453, + "loss": 0.9555, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.1710664927959442, + "rewards/margins": 0.03884775564074516, + "rewards/rejected": -0.20991425216197968, + "step": 3230 + }, + { + "epoch": 0.95, + "learning_rate": 4.3283360235073584e-07, + "logits/chosen": -2.781524181365967, + "logits/rejected": -2.7697031497955322, + "logps/chosen": -213.2795867919922, + "logps/rejected": -185.36959838867188, + "loss": 0.96, + "rewards/accuracies": 0.546875, + "rewards/chosen": -0.16060218214988708, + "rewards/margins": 0.04485396295785904, + "rewards/rejected": -0.20545610785484314, + "step": 3240 + }, + { + "epoch": 0.95, + "learning_rate": 4.322537748478952e-07, + "logits/chosen": -2.727961301803589, + "logits/rejected": -2.7541677951812744, + "logps/chosen": -183.5811309814453, + "logps/rejected": -186.97708129882812, + "loss": 0.9641, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.16682754456996918, + "rewards/margins": 0.05004224181175232, + "rewards/rejected": -0.2168697863817215, + "step": 3250 + }, + { + "epoch": 0.95, + "learning_rate": 4.3167184732706825e-07, + "logits/chosen": -2.712656021118164, + "logits/rejected": -2.727965831756592, + "logps/chosen": -190.7420654296875, + "logps/rejected": -185.13853454589844, + "loss": 0.9528, + "rewards/accuracies": 0.534375011920929, + "rewards/chosen": -0.16370153427124023, + "rewards/margins": 0.04779518395662308, + "rewards/rejected": -0.2114967405796051, + "step": 3260 + }, + { + "epoch": 0.95, + "learning_rate": 4.310878264935113e-07, + "logits/chosen": -2.74177885055542, + "logits/rejected": -2.743264675140381, + "logps/chosen": -176.8439483642578, + "logps/rejected": -179.67626953125, + "loss": 0.9477, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.15423059463500977, + "rewards/margins": 0.049925509840250015, + "rewards/rejected": -0.20415611565113068, + "step": 3270 + }, + { + "epoch": 0.96, + "learning_rate": 4.305017190766006e-07, + "logits/chosen": -2.724313497543335, + "logits/rejected": -2.76788067817688, + "logps/chosen": -168.16397094726562, + "logps/rejected": -183.28399658203125, + "loss": 0.943, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.14488880336284637, + "rewards/margins": 0.07369550317525864, + "rewards/rejected": -0.21858429908752441, + "step": 3280 + }, + { + "epoch": 0.96, + "learning_rate": 4.2991353182975545e-07, + "logits/chosen": -2.7469019889831543, + "logits/rejected": -2.71189546585083, + "logps/chosen": -204.70333862304688, + "logps/rejected": -180.01437377929688, + "loss": 0.9454, + "rewards/accuracies": 0.484375, + "rewards/chosen": -0.13744714856147766, + "rewards/margins": 0.05579395964741707, + "rewards/rejected": -0.19324110448360443, + "step": 3290 + }, + { + "epoch": 0.96, + "learning_rate": 4.293232715303595e-07, + "logits/chosen": -2.7478322982788086, + "logits/rejected": -2.740370988845825, + "logps/chosen": -197.2376251220703, + "logps/rejected": -188.1304168701172, + "loss": 0.9577, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.1363435834646225, + "rewards/margins": 0.036770425736904144, + "rewards/rejected": -0.17311401665210724, + "step": 3300 + }, + { + "epoch": 0.96, + "eval_logits/chosen": -2.662210702896118, + "eval_logits/rejected": -2.65704345703125, + "eval_logps/chosen": -197.0956268310547, + "eval_logps/rejected": -183.0017852783203, + "eval_loss": 0.9461437463760376, + "eval_rewards/accuracies": 0.5793677568435669, + "eval_rewards/chosen": -0.14903247356414795, + "eval_rewards/margins": 0.05901965871453285, + "eval_rewards/rejected": -0.2080521434545517, + "eval_runtime": 443.584, + "eval_samples_per_second": 26.523, + "eval_steps_per_second": 3.316, + "step": 3300 + }, + { + "epoch": 0.97, + "learning_rate": 4.2873094497968364e-07, + "logits/chosen": -2.7464559078216553, + "logits/rejected": -2.7323105335235596, + "logps/chosen": -203.46339416503906, + "logps/rejected": -184.08560180664062, + "loss": 0.9466, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.14876006543636322, + "rewards/margins": 0.04546096548438072, + "rewards/rejected": -0.19422104954719543, + "step": 3310 + }, + { + "epoch": 0.97, + "learning_rate": 4.281365590028067e-07, + "logits/chosen": -2.7063145637512207, + "logits/rejected": -2.7181246280670166, + "logps/chosen": -197.8404998779297, + "logps/rejected": -197.62437438964844, + "loss": 0.9218, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.12893405556678772, + "rewards/margins": 0.09074068069458008, + "rewards/rejected": -0.2196747362613678, + "step": 3320 + }, + { + "epoch": 0.97, + "learning_rate": 4.2754012044853734e-07, + "logits/chosen": -2.7540042400360107, + "logits/rejected": -2.7615749835968018, + "logps/chosen": -190.9739227294922, + "logps/rejected": -186.8544158935547, + "loss": 0.9485, + "rewards/accuracies": 0.559374988079071, + "rewards/chosen": -0.1605357974767685, + "rewards/margins": 0.049000561237335205, + "rewards/rejected": -0.2095363587141037, + "step": 3330 + }, + { + "epoch": 0.97, + "learning_rate": 4.269416361893352e-07, + "logits/chosen": -2.721992254257202, + "logits/rejected": -2.7304000854492188, + "logps/chosen": -206.56936645507812, + "logps/rejected": -191.93504333496094, + "loss": 0.9345, + "rewards/accuracies": 0.546875, + "rewards/chosen": -0.15713246166706085, + "rewards/margins": 0.05960817262530327, + "rewards/rejected": -0.2167406529188156, + "step": 3340 + }, + { + "epoch": 0.98, + "learning_rate": 4.2634111312123144e-07, + "logits/chosen": -2.729740858078003, + "logits/rejected": -2.6976797580718994, + "logps/chosen": -203.6077423095703, + "logps/rejected": -181.80348205566406, + "loss": 0.941, + "rewards/accuracies": 0.559374988079071, + "rewards/chosen": -0.1732880175113678, + "rewards/margins": 0.04767068475484848, + "rewards/rejected": -0.22095870971679688, + "step": 3350 + }, + { + "epoch": 0.98, + "learning_rate": 4.257385581637493e-07, + "logits/chosen": -2.7473340034484863, + "logits/rejected": -2.7488176822662354, + "logps/chosen": -197.68312072753906, + "logps/rejected": -198.27090454101562, + "loss": 0.9564, + "rewards/accuracies": 0.559374988079071, + "rewards/chosen": -0.15684029459953308, + "rewards/margins": 0.04104867950081825, + "rewards/rejected": -0.19788897037506104, + "step": 3360 + }, + { + "epoch": 0.98, + "learning_rate": 4.2513397825982463e-07, + "logits/chosen": -2.755300760269165, + "logits/rejected": -2.751363515853882, + "logps/chosen": -199.49459838867188, + "logps/rejected": -185.002685546875, + "loss": 0.9472, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.15579402446746826, + "rewards/margins": 0.06800516694784164, + "rewards/rejected": -0.2237991988658905, + "step": 3370 + }, + { + "epoch": 0.99, + "learning_rate": 4.245273803757254e-07, + "logits/chosen": -2.779808521270752, + "logits/rejected": -2.763307571411133, + "logps/chosen": -220.1051788330078, + "logps/rejected": -205.69338989257812, + "loss": 0.9421, + "rewards/accuracies": 0.5531250238418579, + "rewards/chosen": -0.14048513770103455, + "rewards/margins": 0.047556690871715546, + "rewards/rejected": -0.18804185092449188, + "step": 3380 + }, + { + "epoch": 0.99, + "learning_rate": 4.239187715009722e-07, + "logits/chosen": -2.752995014190674, + "logits/rejected": -2.762915849685669, + "logps/chosen": -217.9800567626953, + "logps/rejected": -202.8637237548828, + "loss": 0.9429, + "rewards/accuracies": 0.5218750238418579, + "rewards/chosen": -0.17609527707099915, + "rewards/margins": 0.05705328658223152, + "rewards/rejected": -0.23314857482910156, + "step": 3390 + }, + { + "epoch": 0.99, + "learning_rate": 4.2330815864825696e-07, + "logits/chosen": -2.7742648124694824, + "logits/rejected": -2.774212598800659, + "logps/chosen": -226.41854858398438, + "logps/rejected": -207.75296020507812, + "loss": 0.9374, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.15242572128772736, + "rewards/margins": 0.06987457722425461, + "rewards/rejected": -0.22230032086372375, + "step": 3400 + }, + { + "epoch": 0.99, + "eval_logits/chosen": -2.6551883220672607, + "eval_logits/rejected": -2.649892807006836, + "eval_logps/chosen": -197.13763427734375, + "eval_logps/rejected": -183.06625366210938, + "eval_loss": 0.9452260136604309, + "eval_rewards/accuracies": 0.5769884586334229, + "eval_rewards/chosen": -0.15323300659656525, + "eval_rewards/margins": 0.061263974756002426, + "eval_rewards/rejected": -0.21449698507785797, + "eval_runtime": 443.4932, + "eval_samples_per_second": 26.528, + "eval_steps_per_second": 3.317, + "step": 3400 + }, + { + "epoch": 0.99, + "learning_rate": 4.2269554885336234e-07, + "logits/chosen": -2.780066728591919, + "logits/rejected": -2.7521700859069824, + "logps/chosen": -226.03341674804688, + "logps/rejected": -197.58273315429688, + "loss": 0.9342, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.17001919448375702, + "rewards/margins": 0.0716620534658432, + "rewards/rejected": -0.24168124794960022, + "step": 3410 + }, + { + "epoch": 1.0, + "learning_rate": 4.2208094917508095e-07, + "logits/chosen": -2.7597529888153076, + "logits/rejected": -2.7499496936798096, + "logps/chosen": -178.80166625976562, + "logps/rejected": -175.30052185058594, + "loss": 0.9548, + "rewards/accuracies": 0.528124988079071, + "rewards/chosen": -0.18084251880645752, + "rewards/margins": 0.033990949392318726, + "rewards/rejected": -0.21483345329761505, + "step": 3420 + }, + { + "epoch": 1.0, + "learning_rate": 4.214643666951338e-07, + "logits/chosen": -2.7603445053100586, + "logits/rejected": -2.770897388458252, + "logps/chosen": -203.36367797851562, + "logps/rejected": -199.06788635253906, + "loss": 0.9237, + "rewards/accuracies": 0.6031249761581421, + "rewards/chosen": -0.14491590857505798, + "rewards/margins": 0.09012307226657867, + "rewards/rejected": -0.23503899574279785, + "step": 3430 + }, + { + "epoch": 1.0, + "learning_rate": 4.2084580851808866e-07, + "logits/chosen": -2.758740186691284, + "logits/rejected": -2.7576069831848145, + "logps/chosen": -188.8221893310547, + "logps/rejected": -176.63290405273438, + "loss": 0.9499, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.15238967537879944, + "rewards/margins": 0.05928494781255722, + "rewards/rejected": -0.21167464554309845, + "step": 3440 + }, + { + "epoch": 1.01, + "learning_rate": 4.2022528177127827e-07, + "logits/chosen": -2.7145676612854004, + "logits/rejected": -2.71001935005188, + "logps/chosen": -187.4506072998047, + "logps/rejected": -177.14111328125, + "loss": 0.9288, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.16252227127552032, + "rewards/margins": 0.0730685144662857, + "rewards/rejected": -0.23559077084064484, + "step": 3450 + }, + { + "epoch": 1.01, + "learning_rate": 4.196027936047182e-07, + "logits/chosen": -2.7500064373016357, + "logits/rejected": -2.721895217895508, + "logps/chosen": -209.0021209716797, + "logps/rejected": -185.7652130126953, + "loss": 0.9354, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.16328108310699463, + "rewards/margins": 0.08029235899448395, + "rewards/rejected": -0.24357345700263977, + "step": 3460 + }, + { + "epoch": 1.01, + "learning_rate": 4.189783511910244e-07, + "logits/chosen": -2.7528345584869385, + "logits/rejected": -2.764251232147217, + "logps/chosen": -181.8857879638672, + "logps/rejected": -176.1710205078125, + "loss": 0.9325, + "rewards/accuracies": 0.6156250238418579, + "rewards/chosen": -0.14209429919719696, + "rewards/margins": 0.08681829273700714, + "rewards/rejected": -0.2289125621318817, + "step": 3470 + }, + { + "epoch": 1.02, + "learning_rate": 4.1835196172533083e-07, + "logits/chosen": -2.7215676307678223, + "logits/rejected": -2.691948175430298, + "logps/chosen": -220.85800170898438, + "logps/rejected": -192.20065307617188, + "loss": 0.9396, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.16459617018699646, + "rewards/margins": 0.06939554214477539, + "rewards/rejected": -0.23399169743061066, + "step": 3480 + }, + { + "epoch": 1.02, + "learning_rate": 4.1772363242520615e-07, + "logits/chosen": -2.709977626800537, + "logits/rejected": -2.7163503170013428, + "logps/chosen": -192.06781005859375, + "logps/rejected": -177.94094848632812, + "loss": 0.9346, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.16086141765117645, + "rewards/margins": 0.07237172871828079, + "rewards/rejected": -0.23323316872119904, + "step": 3490 + }, + { + "epoch": 1.02, + "learning_rate": 4.1709337053057083e-07, + "logits/chosen": -2.753588914871216, + "logits/rejected": -2.756035566329956, + "logps/chosen": -190.80001831054688, + "logps/rejected": -191.4000701904297, + "loss": 0.9299, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.14268064498901367, + "rewards/margins": 0.08039744943380356, + "rewards/rejected": -0.22307810187339783, + "step": 3500 + }, + { + "epoch": 1.02, + "eval_logits/chosen": -2.666295051574707, + "eval_logits/rejected": -2.661217212677002, + "eval_logps/chosen": -197.17547607421875, + "eval_logps/rejected": -183.11599731445312, + "eval_loss": 0.9439437389373779, + "eval_rewards/accuracies": 0.5769884586334229, + "eval_rewards/chosen": -0.15701434016227722, + "eval_rewards/margins": 0.06245831400156021, + "eval_rewards/rejected": -0.21947267651557922, + "eval_runtime": 443.578, + "eval_samples_per_second": 26.523, + "eval_steps_per_second": 3.316, + "step": 3500 + }, + { + "epoch": 1.02, + "learning_rate": 4.164611833036136e-07, + "logits/chosen": -2.714778423309326, + "logits/rejected": -2.7263901233673096, + "logps/chosen": -201.15415954589844, + "logps/rejected": -174.53749084472656, + "loss": 0.9326, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.1574777513742447, + "rewards/margins": 0.06711246818304062, + "rewards/rejected": -0.2245902270078659, + "step": 3510 + }, + { + "epoch": 1.03, + "learning_rate": 4.1582707802870777e-07, + "logits/chosen": -2.730517864227295, + "logits/rejected": -2.725821018218994, + "logps/chosen": -210.6570281982422, + "logps/rejected": -185.85055541992188, + "loss": 0.9278, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.15748251974582672, + "rewards/margins": 0.057965390384197235, + "rewards/rejected": -0.21544790267944336, + "step": 3520 + }, + { + "epoch": 1.03, + "learning_rate": 4.151910620123276e-07, + "logits/chosen": -2.756896734237671, + "logits/rejected": -2.740691661834717, + "logps/chosen": -202.3911590576172, + "logps/rejected": -182.94061279296875, + "loss": 0.9473, + "rewards/accuracies": 0.5218750238418579, + "rewards/chosen": -0.185185968875885, + "rewards/margins": 0.03288014605641365, + "rewards/rejected": -0.21806609630584717, + "step": 3530 + }, + { + "epoch": 1.03, + "learning_rate": 4.145531425829636e-07, + "logits/chosen": -2.747807264328003, + "logits/rejected": -2.7508645057678223, + "logps/chosen": -191.0756072998047, + "logps/rejected": -178.44808959960938, + "loss": 0.9394, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.1534716784954071, + "rewards/margins": 0.0547831654548645, + "rewards/rejected": -0.2082548439502716, + "step": 3540 + }, + { + "epoch": 1.04, + "learning_rate": 4.139133270910384e-07, + "logits/chosen": -2.7193522453308105, + "logits/rejected": -2.721191644668579, + "logps/chosen": -201.148681640625, + "logps/rejected": -175.58279418945312, + "loss": 0.9385, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.14847466349601746, + "rewards/margins": 0.06052703410387039, + "rewards/rejected": -0.20900170505046844, + "step": 3550 + }, + { + "epoch": 1.04, + "learning_rate": 4.13271622908822e-07, + "logits/chosen": -2.7018117904663086, + "logits/rejected": -2.705580711364746, + "logps/chosen": -173.38417053222656, + "logps/rejected": -177.74185180664062, + "loss": 0.9373, + "rewards/accuracies": 0.590624988079071, + "rewards/chosen": -0.1533229649066925, + "rewards/margins": 0.07361769676208496, + "rewards/rejected": -0.22694067656993866, + "step": 3560 + }, + { + "epoch": 1.04, + "learning_rate": 4.126280374303469e-07, + "logits/chosen": -2.767444372177124, + "logits/rejected": -2.7558300495147705, + "logps/chosen": -197.06289672851562, + "logps/rejected": -181.7204132080078, + "loss": 0.941, + "rewards/accuracies": 0.546875, + "rewards/chosen": -0.15346328914165497, + "rewards/margins": 0.06391434371471405, + "rewards/rejected": -0.21737761795520782, + "step": 3570 + }, + { + "epoch": 1.04, + "learning_rate": 4.1198257807132276e-07, + "logits/chosen": -2.734412670135498, + "logits/rejected": -2.743535041809082, + "logps/chosen": -186.5696563720703, + "logps/rejected": -192.47348022460938, + "loss": 0.9151, + "rewards/accuracies": 0.6656249761581421, + "rewards/chosen": -0.1432243436574936, + "rewards/margins": 0.08326814323663712, + "rewards/rejected": -0.2264924943447113, + "step": 3580 + }, + { + "epoch": 1.05, + "learning_rate": 4.11335252269051e-07, + "logits/chosen": -2.715536594390869, + "logits/rejected": -2.7261719703674316, + "logps/chosen": -189.0170135498047, + "logps/rejected": -176.01217651367188, + "loss": 0.9335, + "rewards/accuracies": 0.621874988079071, + "rewards/chosen": -0.1607503592967987, + "rewards/margins": 0.07931359112262726, + "rewards/rejected": -0.24006398022174835, + "step": 3590 + }, + { + "epoch": 1.05, + "learning_rate": 4.1068606748233916e-07, + "logits/chosen": -2.704840898513794, + "logits/rejected": -2.71500301361084, + "logps/chosen": -195.28549194335938, + "logps/rejected": -186.14892578125, + "loss": 0.936, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.16414105892181396, + "rewards/margins": 0.054746825248003006, + "rewards/rejected": -0.21888788044452667, + "step": 3600 + }, + { + "epoch": 1.05, + "eval_logits/chosen": -2.658426284790039, + "eval_logits/rejected": -2.6531946659088135, + "eval_logps/chosen": -197.23300170898438, + "eval_logps/rejected": -183.18643188476562, + "eval_loss": 0.9437562823295593, + "eval_rewards/accuracies": 0.5788578987121582, + "eval_rewards/chosen": -0.16277101635932922, + "eval_rewards/margins": 0.06374562531709671, + "eval_rewards/rejected": -0.22651664912700653, + "eval_runtime": 443.5371, + "eval_samples_per_second": 26.525, + "eval_steps_per_second": 3.317, + "step": 3600 + }, + { + "epoch": 1.05, + "learning_rate": 4.100350311914149e-07, + "logits/chosen": -2.742522954940796, + "logits/rejected": -2.7588882446289062, + "logps/chosen": -218.0732421875, + "logps/rejected": -205.04379272460938, + "loss": 0.9188, + "rewards/accuracies": 0.640625, + "rewards/chosen": -0.14045368134975433, + "rewards/margins": 0.11588656902313232, + "rewards/rejected": -0.25634023547172546, + "step": 3610 + }, + { + "epoch": 1.06, + "learning_rate": 4.093821508978399e-07, + "logits/chosen": -2.7190682888031006, + "logits/rejected": -2.7327933311462402, + "logps/chosen": -193.43785095214844, + "logps/rejected": -187.7996826171875, + "loss": 0.9622, + "rewards/accuracies": 0.546875, + "rewards/chosen": -0.17929328978061676, + "rewards/margins": 0.03454384207725525, + "rewards/rejected": -0.2138371467590332, + "step": 3620 + }, + { + "epoch": 1.06, + "learning_rate": 4.087274341244232e-07, + "logits/chosen": -2.733224868774414, + "logits/rejected": -2.7335586547851562, + "logps/chosen": -196.0847625732422, + "logps/rejected": -187.5789031982422, + "loss": 0.9411, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.17219600081443787, + "rewards/margins": 0.07483509182929993, + "rewards/rejected": -0.24703112244606018, + "step": 3630 + }, + { + "epoch": 1.06, + "learning_rate": 4.0807088841513473e-07, + "logits/chosen": -2.76577091217041, + "logits/rejected": -2.7651515007019043, + "logps/chosen": -209.15121459960938, + "logps/rejected": -203.23410034179688, + "loss": 0.9332, + "rewards/accuracies": 0.6156250238418579, + "rewards/chosen": -0.14608857035636902, + "rewards/margins": 0.09846082329750061, + "rewards/rejected": -0.24454942345619202, + "step": 3640 + }, + { + "epoch": 1.06, + "learning_rate": 4.074125213350184e-07, + "logits/chosen": -2.7061800956726074, + "logits/rejected": -2.7219319343566895, + "logps/chosen": -189.35731506347656, + "logps/rejected": -191.7716522216797, + "loss": 0.9241, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.14296016097068787, + "rewards/margins": 0.09905602782964706, + "rewards/rejected": -0.24201619625091553, + "step": 3650 + }, + { + "epoch": 1.07, + "learning_rate": 4.0675234047010475e-07, + "logits/chosen": -2.757671356201172, + "logits/rejected": -2.7647621631622314, + "logps/chosen": -211.3065643310547, + "logps/rejected": -206.5069122314453, + "loss": 0.9335, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.13889974355697632, + "rewards/margins": 0.09706652909517288, + "rewards/rejected": -0.2359662801027298, + "step": 3660 + }, + { + "epoch": 1.07, + "learning_rate": 4.0609035342732374e-07, + "logits/chosen": -2.792123556137085, + "logits/rejected": -2.7674877643585205, + "logps/chosen": -222.4933624267578, + "logps/rejected": -188.7619171142578, + "loss": 0.9397, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.19772221148014069, + "rewards/margins": 0.05392972379922867, + "rewards/rejected": -0.25165191292762756, + "step": 3670 + }, + { + "epoch": 1.07, + "learning_rate": 4.0542656783441685e-07, + "logits/chosen": -2.74934458732605, + "logits/rejected": -2.7401318550109863, + "logps/chosen": -199.81399536132812, + "logps/rejected": -187.61581420898438, + "loss": 0.9394, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.18191394209861755, + "rewards/margins": 0.0601019486784935, + "rewards/rejected": -0.24201588332653046, + "step": 3680 + }, + { + "epoch": 1.08, + "learning_rate": 4.047609913398496e-07, + "logits/chosen": -2.7838268280029297, + "logits/rejected": -2.793952703475952, + "logps/chosen": -219.28515625, + "logps/rejected": -220.8399658203125, + "loss": 0.925, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.18908637762069702, + "rewards/margins": 0.0818186104297638, + "rewards/rejected": -0.2709049582481384, + "step": 3690 + }, + { + "epoch": 1.08, + "learning_rate": 4.0409363161272283e-07, + "logits/chosen": -2.730079174041748, + "logits/rejected": -2.7240207195281982, + "logps/chosen": -204.08016967773438, + "logps/rejected": -179.5956573486328, + "loss": 0.9435, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.17038798332214355, + "rewards/margins": 0.05574061721563339, + "rewards/rejected": -0.22612860798835754, + "step": 3700 + }, + { + "epoch": 1.08, + "eval_logits/chosen": -2.6722583770751953, + "eval_logits/rejected": -2.6673152446746826, + "eval_logps/chosen": -197.2606658935547, + "eval_logps/rejected": -183.226318359375, + "eval_loss": 0.9420310258865356, + "eval_rewards/accuracies": 0.5807273983955383, + "eval_rewards/chosen": -0.16553470492362976, + "eval_rewards/margins": 0.06496822834014893, + "eval_rewards/rejected": -0.2305029332637787, + "eval_runtime": 443.582, + "eval_samples_per_second": 26.523, + "eval_steps_per_second": 3.316, + "step": 3700 + }, + { + "epoch": 1.08, + "learning_rate": 4.03424496342685e-07, + "logits/chosen": -2.7483789920806885, + "logits/rejected": -2.7456538677215576, + "logps/chosen": -175.35150146484375, + "logps/rejected": -163.5673065185547, + "loss": 0.9311, + "rewards/accuracies": 0.6031249761581421, + "rewards/chosen": -0.1589987576007843, + "rewards/margins": 0.056345902383327484, + "rewards/rejected": -0.21534466743469238, + "step": 3710 + }, + { + "epoch": 1.09, + "learning_rate": 4.02753593239843e-07, + "logits/chosen": -2.747954845428467, + "logits/rejected": -2.7363548278808594, + "logps/chosen": -216.8974609375, + "logps/rejected": -190.09544372558594, + "loss": 0.9216, + "rewards/accuracies": 0.546875, + "rewards/chosen": -0.17782175540924072, + "rewards/margins": 0.06778020411729813, + "rewards/rejected": -0.24560198187828064, + "step": 3720 + }, + { + "epoch": 1.09, + "learning_rate": 4.0208093003467366e-07, + "logits/chosen": -2.734060525894165, + "logits/rejected": -2.7579092979431152, + "logps/chosen": -182.96588134765625, + "logps/rejected": -188.7209930419922, + "loss": 0.9241, + "rewards/accuracies": 0.6156250238418579, + "rewards/chosen": -0.17049555480480194, + "rewards/margins": 0.07502481341362, + "rewards/rejected": -0.24552038311958313, + "step": 3730 + }, + { + "epoch": 1.09, + "learning_rate": 4.014065144779345e-07, + "logits/chosen": -2.7681238651275635, + "logits/rejected": -2.757328748703003, + "logps/chosen": -201.3914794921875, + "logps/rejected": -186.22134399414062, + "loss": 0.9316, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.15492196381092072, + "rewards/margins": 0.07273847609758377, + "rewards/rejected": -0.2276604175567627, + "step": 3740 + }, + { + "epoch": 1.09, + "learning_rate": 4.0073035434057477e-07, + "logits/chosen": -2.7487292289733887, + "logits/rejected": -2.7397453784942627, + "logps/chosen": -182.2095184326172, + "logps/rejected": -174.14088439941406, + "loss": 0.9317, + "rewards/accuracies": 0.590624988079071, + "rewards/chosen": -0.14754052460193634, + "rewards/margins": 0.0717361718416214, + "rewards/rejected": -0.21927671134471893, + "step": 3750 + }, + { + "epoch": 1.1, + "learning_rate": 4.0005245741364537e-07, + "logits/chosen": -2.7562036514282227, + "logits/rejected": -2.7581675052642822, + "logps/chosen": -207.8866729736328, + "logps/rejected": -186.6951141357422, + "loss": 0.9442, + "rewards/accuracies": 0.546875, + "rewards/chosen": -0.16531476378440857, + "rewards/margins": 0.05598212406039238, + "rewards/rejected": -0.22129687666893005, + "step": 3760 + }, + { + "epoch": 1.1, + "learning_rate": 3.9937283150820935e-07, + "logits/chosen": -2.742161273956299, + "logits/rejected": -2.717721939086914, + "logps/chosen": -191.58201599121094, + "logps/rejected": -175.09475708007812, + "loss": 0.9318, + "rewards/accuracies": 0.5718749761581421, + "rewards/chosen": -0.16936281323432922, + "rewards/margins": 0.06095327064394951, + "rewards/rejected": -0.23031607270240784, + "step": 3770 + }, + { + "epoch": 1.1, + "learning_rate": 3.9869148445525195e-07, + "logits/chosen": -2.7626867294311523, + "logits/rejected": -2.7525551319122314, + "logps/chosen": -212.2379913330078, + "logps/rejected": -193.54745483398438, + "loss": 0.9094, + "rewards/accuracies": 0.590624988079071, + "rewards/chosen": -0.17974770069122314, + "rewards/margins": 0.08587310463190079, + "rewards/rejected": -0.26562076807022095, + "step": 3780 + }, + { + "epoch": 1.11, + "learning_rate": 3.980084241055905e-07, + "logits/chosen": -2.7718071937561035, + "logits/rejected": -2.7361700534820557, + "logps/chosen": -220.29745483398438, + "logps/rejected": -180.39019775390625, + "loss": 0.9433, + "rewards/accuracies": 0.590624988079071, + "rewards/chosen": -0.18197306990623474, + "rewards/margins": 0.05966731905937195, + "rewards/rejected": -0.2416403740644455, + "step": 3790 + }, + { + "epoch": 1.11, + "learning_rate": 3.973236583297835e-07, + "logits/chosen": -2.736232042312622, + "logits/rejected": -2.781214475631714, + "logps/chosen": -199.220947265625, + "logps/rejected": -213.5320281982422, + "loss": 0.9341, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.16228926181793213, + "rewards/margins": 0.1069878339767456, + "rewards/rejected": -0.26927709579467773, + "step": 3800 + }, + { + "epoch": 1.11, + "eval_logits/chosen": -2.663583993911743, + "eval_logits/rejected": -2.6584584712982178, + "eval_logps/chosen": -197.3029327392578, + "eval_logps/rejected": -183.2721405029297, + "eval_loss": 0.9421671032905579, + "eval_rewards/accuracies": 0.581237256526947, + "eval_rewards/chosen": -0.1697637140750885, + "eval_rewards/margins": 0.0653214231133461, + "eval_rewards/rejected": -0.2350851446390152, + "eval_runtime": 443.4935, + "eval_samples_per_second": 26.528, + "eval_steps_per_second": 3.317, + "step": 3800 + }, + { + "epoch": 1.11, + "learning_rate": 3.966371950180404e-07, + "logits/chosen": -2.72824764251709, + "logits/rejected": -2.7456448078155518, + "logps/chosen": -190.12353515625, + "logps/rejected": -185.89035034179688, + "loss": 0.9381, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.16197335720062256, + "rewards/margins": 0.071574866771698, + "rewards/rejected": -0.23354823887348175, + "step": 3810 + }, + { + "epoch": 1.11, + "learning_rate": 3.9594904208013034e-07, + "logits/chosen": -2.7469112873077393, + "logits/rejected": -2.7380030155181885, + "logps/chosen": -205.6768035888672, + "logps/rejected": -195.28128051757812, + "loss": 0.9039, + "rewards/accuracies": 0.6343749761581421, + "rewards/chosen": -0.15377888083457947, + "rewards/margins": 0.11534550040960312, + "rewards/rejected": -0.2691243886947632, + "step": 3820 + }, + { + "epoch": 1.12, + "learning_rate": 3.952592074452914e-07, + "logits/chosen": -2.765177011489868, + "logits/rejected": -2.7546281814575195, + "logps/chosen": -202.8085479736328, + "logps/rejected": -195.9279022216797, + "loss": 0.9343, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.15616342425346375, + "rewards/margins": 0.08027738332748413, + "rewards/rejected": -0.23644080758094788, + "step": 3830 + }, + { + "epoch": 1.12, + "learning_rate": 3.9456769906213885e-07, + "logits/chosen": -2.7568039894104004, + "logits/rejected": -2.7418313026428223, + "logps/chosen": -215.3795166015625, + "logps/rejected": -184.53538513183594, + "loss": 0.9448, + "rewards/accuracies": 0.546875, + "rewards/chosen": -0.16710713505744934, + "rewards/margins": 0.04904966801404953, + "rewards/rejected": -0.21615679562091827, + "step": 3840 + }, + { + "epoch": 1.12, + "learning_rate": 3.9387452489857365e-07, + "logits/chosen": -2.716653347015381, + "logits/rejected": -2.7444241046905518, + "logps/chosen": -187.95553588867188, + "logps/rejected": -183.2508087158203, + "loss": 0.9209, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.15672233700752258, + "rewards/margins": 0.10151603072881699, + "rewards/rejected": -0.25823837518692017, + "step": 3850 + }, + { + "epoch": 1.13, + "learning_rate": 3.9317969294169086e-07, + "logits/chosen": -2.717005491256714, + "logits/rejected": -2.7423160076141357, + "logps/chosen": -189.92715454101562, + "logps/rejected": -203.66329956054688, + "loss": 0.9349, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.16257938742637634, + "rewards/margins": 0.07710906118154526, + "rewards/rejected": -0.239688441157341, + "step": 3860 + }, + { + "epoch": 1.13, + "learning_rate": 3.9248321119768725e-07, + "logits/chosen": -2.756875991821289, + "logits/rejected": -2.7580325603485107, + "logps/chosen": -193.33822631835938, + "logps/rejected": -197.3529052734375, + "loss": 0.9374, + "rewards/accuracies": 0.590624988079071, + "rewards/chosen": -0.17182958126068115, + "rewards/margins": 0.07164481282234192, + "rewards/rejected": -0.24347439408302307, + "step": 3870 + }, + { + "epoch": 1.13, + "learning_rate": 3.9178508769176954e-07, + "logits/chosen": -2.746933698654175, + "logits/rejected": -2.765470266342163, + "logps/chosen": -183.94923400878906, + "logps/rejected": -172.761962890625, + "loss": 0.9257, + "rewards/accuracies": 0.621874988079071, + "rewards/chosen": -0.16591930389404297, + "rewards/margins": 0.08755029737949371, + "rewards/rejected": -0.2534696161746979, + "step": 3880 + }, + { + "epoch": 1.13, + "learning_rate": 3.9108533046806134e-07, + "logits/chosen": -2.714050769805908, + "logits/rejected": -2.7366297245025635, + "logps/chosen": -199.7872314453125, + "logps/rejected": -189.11221313476562, + "loss": 0.9329, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.18223315477371216, + "rewards/margins": 0.05786599591374397, + "rewards/rejected": -0.24009914696216583, + "step": 3890 + }, + { + "epoch": 1.14, + "learning_rate": 3.903839475895111e-07, + "logits/chosen": -2.7495663166046143, + "logits/rejected": -2.7438509464263916, + "logps/chosen": -196.3547821044922, + "logps/rejected": -184.18392944335938, + "loss": 0.9296, + "rewards/accuracies": 0.5718749761581421, + "rewards/chosen": -0.16668249666690826, + "rewards/margins": 0.06621585786342621, + "rewards/rejected": -0.23289835453033447, + "step": 3900 + }, + { + "epoch": 1.14, + "eval_logits/chosen": -2.643688678741455, + "eval_logits/rejected": -2.638190984725952, + "eval_logps/chosen": -197.34109497070312, + "eval_logps/rejected": -183.32252502441406, + "eval_loss": 0.9405214786529541, + "eval_rewards/accuracies": 0.5713800191879272, + "eval_rewards/chosen": -0.17357788980007172, + "eval_rewards/margins": 0.06654638051986694, + "eval_rewards/rejected": -0.24012430012226105, + "eval_runtime": 443.4943, + "eval_samples_per_second": 26.528, + "eval_steps_per_second": 3.317, + "step": 3900 + }, + { + "epoch": 1.14, + "learning_rate": 3.8968094713779847e-07, + "logits/chosen": -2.7617404460906982, + "logits/rejected": -2.75152325630188, + "logps/chosen": -200.19493103027344, + "logps/rejected": -186.37130737304688, + "loss": 0.9433, + "rewards/accuracies": 0.5718749761581421, + "rewards/chosen": -0.2069055140018463, + "rewards/margins": 0.04876132681965828, + "rewards/rejected": -0.2556668221950531, + "step": 3910 + }, + { + "epoch": 1.14, + "learning_rate": 3.8897633721324185e-07, + "logits/chosen": -2.7647957801818848, + "logits/rejected": -2.7173893451690674, + "logps/chosen": -201.73190307617188, + "logps/rejected": -167.64065551757812, + "loss": 0.9488, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.18705810606479645, + "rewards/margins": 0.06296879798173904, + "rewards/rejected": -0.2500268816947937, + "step": 3920 + }, + { + "epoch": 1.15, + "learning_rate": 3.882701259347047e-07, + "logits/chosen": -2.7746589183807373, + "logits/rejected": -2.7673373222351074, + "logps/chosen": -216.4488983154297, + "logps/rejected": -196.27919006347656, + "loss": 0.938, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.18840670585632324, + "rewards/margins": 0.08028068393468857, + "rewards/rejected": -0.2686874270439148, + "step": 3930 + }, + { + "epoch": 1.15, + "learning_rate": 3.8756232143950217e-07, + "logits/chosen": -2.764437198638916, + "logits/rejected": -2.770145893096924, + "logps/chosen": -193.11636352539062, + "logps/rejected": -191.73863220214844, + "loss": 0.923, + "rewards/accuracies": 0.628125011920929, + "rewards/chosen": -0.18075844645500183, + "rewards/margins": 0.0936131626367569, + "rewards/rejected": -0.2743716239929199, + "step": 3940 + }, + { + "epoch": 1.15, + "learning_rate": 3.86852931883307e-07, + "logits/chosen": -2.751063823699951, + "logits/rejected": -2.739689588546753, + "logps/chosen": -176.7470245361328, + "logps/rejected": -161.86376953125, + "loss": 0.9265, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.18067319691181183, + "rewards/margins": 0.06665468215942383, + "rewards/rejected": -0.24732787907123566, + "step": 3950 + }, + { + "epoch": 1.16, + "learning_rate": 3.8614196544005614e-07, + "logits/chosen": -2.758460760116577, + "logits/rejected": -2.7591326236724854, + "logps/chosen": -198.73178100585938, + "logps/rejected": -187.89523315429688, + "loss": 0.9229, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.18694129586219788, + "rewards/margins": 0.06280256062746048, + "rewards/rejected": -0.24974386394023895, + "step": 3960 + }, + { + "epoch": 1.16, + "learning_rate": 3.854294303018558e-07, + "logits/chosen": -2.755293369293213, + "logits/rejected": -2.7594475746154785, + "logps/chosen": -192.7220916748047, + "logps/rejected": -184.11373901367188, + "loss": 0.9507, + "rewards/accuracies": 0.5718749761581421, + "rewards/chosen": -0.18343867361545563, + "rewards/margins": 0.06071305274963379, + "rewards/rejected": -0.2441517412662506, + "step": 3970 + }, + { + "epoch": 1.16, + "learning_rate": 3.8471533467888773e-07, + "logits/chosen": -2.7248222827911377, + "logits/rejected": -2.7303566932678223, + "logps/chosen": -194.8540802001953, + "logps/rejected": -191.58544921875, + "loss": 0.9664, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.19519048929214478, + "rewards/margins": 0.044502776116132736, + "rewards/rejected": -0.23969325423240662, + "step": 3980 + }, + { + "epoch": 1.16, + "learning_rate": 3.8399968679931436e-07, + "logits/chosen": -2.778623342514038, + "logits/rejected": -2.7736964225769043, + "logps/chosen": -213.4280242919922, + "logps/rejected": -194.47512817382812, + "loss": 0.9344, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.17752891778945923, + "rewards/margins": 0.04777335748076439, + "rewards/rejected": -0.2253022938966751, + "step": 3990 + }, + { + "epoch": 1.17, + "learning_rate": 3.832824949091839e-07, + "logits/chosen": -2.7643864154815674, + "logits/rejected": -2.729543685913086, + "logps/chosen": -191.1431884765625, + "logps/rejected": -157.68287658691406, + "loss": 0.9338, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -0.18981656432151794, + "rewards/margins": 0.03325197845697403, + "rewards/rejected": -0.22306856513023376, + "step": 4000 + }, + { + "epoch": 1.17, + "eval_logits/chosen": -2.6482629776000977, + "eval_logits/rejected": -2.642835855484009, + "eval_logps/chosen": -197.35194396972656, + "eval_logps/rejected": -183.3475799560547, + "eval_loss": 0.9401752948760986, + "eval_rewards/accuracies": 0.5771583914756775, + "eval_rewards/chosen": -0.17466464638710022, + "eval_rewards/margins": 0.06796282529830933, + "eval_rewards/rejected": -0.24262748658657074, + "eval_runtime": 443.5283, + "eval_samples_per_second": 26.526, + "eval_steps_per_second": 3.317, + "step": 4000 + }, + { + "epoch": 1.17, + "learning_rate": 3.825637672723354e-07, + "logits/chosen": -2.7617149353027344, + "logits/rejected": -2.7304465770721436, + "logps/chosen": -219.8583984375, + "logps/rejected": -195.1961212158203, + "loss": 0.9498, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.1955319046974182, + "rewards/margins": 0.05322521924972534, + "rewards/rejected": -0.24875712394714355, + "step": 4010 + }, + { + "epoch": 1.17, + "learning_rate": 3.818435121703036e-07, + "logits/chosen": -2.721818208694458, + "logits/rejected": -2.752952814102173, + "logps/chosen": -190.40536499023438, + "logps/rejected": -196.10472106933594, + "loss": 0.9467, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -0.1841999590396881, + "rewards/margins": 0.058970857411623, + "rewards/rejected": -0.24317078292369843, + "step": 4020 + }, + { + "epoch": 1.18, + "learning_rate": 3.8112173790222356e-07, + "logits/chosen": -2.7530345916748047, + "logits/rejected": -2.710888385772705, + "logps/chosen": -198.40542602539062, + "logps/rejected": -170.2858123779297, + "loss": 0.9357, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -0.19204849004745483, + "rewards/margins": 0.051998645067214966, + "rewards/rejected": -0.2440471202135086, + "step": 4030 + }, + { + "epoch": 1.18, + "learning_rate": 3.8039845278473467e-07, + "logits/chosen": -2.7375307083129883, + "logits/rejected": -2.7373642921447754, + "logps/chosen": -193.3035888671875, + "logps/rejected": -186.16928100585938, + "loss": 0.9375, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.17028315365314484, + "rewards/margins": 0.05886261910200119, + "rewards/rejected": -0.22914576530456543, + "step": 4040 + }, + { + "epoch": 1.18, + "learning_rate": 3.7967366515188515e-07, + "logits/chosen": -2.7566046714782715, + "logits/rejected": -2.7663302421569824, + "logps/chosen": -203.11294555664062, + "logps/rejected": -197.13296508789062, + "loss": 0.9227, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.18032750487327576, + "rewards/margins": 0.082804836332798, + "rewards/rejected": -0.26313233375549316, + "step": 4050 + }, + { + "epoch": 1.18, + "learning_rate": 3.7894738335503605e-07, + "logits/chosen": -2.718829870223999, + "logits/rejected": -2.7271435260772705, + "logps/chosen": -169.3847198486328, + "logps/rejected": -169.4076385498047, + "loss": 0.9259, + "rewards/accuracies": 0.5406249761581421, + "rewards/chosen": -0.19295862317085266, + "rewards/margins": 0.03824089840054512, + "rewards/rejected": -0.23119953274726868, + "step": 4060 + }, + { + "epoch": 1.19, + "learning_rate": 3.782196157627649e-07, + "logits/chosen": -2.7349534034729004, + "logits/rejected": -2.7287216186523438, + "logps/chosen": -188.2509765625, + "logps/rejected": -180.3507080078125, + "loss": 0.9225, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.1794901192188263, + "rewards/margins": 0.09128087759017944, + "rewards/rejected": -0.27077096700668335, + "step": 4070 + }, + { + "epoch": 1.19, + "learning_rate": 3.7749037076076915e-07, + "logits/chosen": -2.734947681427002, + "logits/rejected": -2.7134604454040527, + "logps/chosen": -188.677490234375, + "logps/rejected": -162.92825317382812, + "loss": 0.9366, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.17669777572155, + "rewards/margins": 0.058089107275009155, + "rewards/rejected": -0.23478689789772034, + "step": 4080 + }, + { + "epoch": 1.19, + "learning_rate": 3.767596567517698e-07, + "logits/chosen": -2.747828960418701, + "logits/rejected": -2.728135585784912, + "logps/chosen": -191.05796813964844, + "logps/rejected": -178.67286682128906, + "loss": 0.9393, + "rewards/accuracies": 0.590624988079071, + "rewards/chosen": -0.1807515174150467, + "rewards/margins": 0.07468868046998978, + "rewards/rejected": -0.2554401755332947, + "step": 4090 + }, + { + "epoch": 1.2, + "learning_rate": 3.760274821554146e-07, + "logits/chosen": -2.733659029006958, + "logits/rejected": -2.734858274459839, + "logps/chosen": -205.01156616210938, + "logps/rejected": -184.3057403564453, + "loss": 0.9257, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.1808921992778778, + "rewards/margins": 0.07933205366134644, + "rewards/rejected": -0.26022425293922424, + "step": 4100 + }, + { + "epoch": 1.2, + "eval_logits/chosen": -2.6465163230895996, + "eval_logits/rejected": -2.6410677433013916, + "eval_logps/chosen": -197.38494873046875, + "eval_logps/rejected": -183.38291931152344, + "eval_loss": 0.9395056962966919, + "eval_rewards/accuracies": 0.5766485333442688, + "eval_rewards/chosen": -0.17796368896961212, + "eval_rewards/margins": 0.06819843500852585, + "eval_rewards/rejected": -0.24616213142871857, + "eval_runtime": 443.4769, + "eval_samples_per_second": 26.529, + "eval_steps_per_second": 3.317, + "step": 4100 + }, + { + "epoch": 1.2, + "learning_rate": 3.752938554081806e-07, + "logits/chosen": -2.741283893585205, + "logits/rejected": -2.728320360183716, + "logps/chosen": -176.6365203857422, + "logps/rejected": -167.493896484375, + "loss": 0.9401, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.1969231367111206, + "rewards/margins": 0.06731526553630829, + "rewards/rejected": -0.2642384171485901, + "step": 4110 + }, + { + "epoch": 1.2, + "learning_rate": 3.7455878496327765e-07, + "logits/chosen": -2.7120602130889893, + "logits/rejected": -2.741365671157837, + "logps/chosen": -197.4280242919922, + "logps/rejected": -198.5906219482422, + "loss": 0.931, + "rewards/accuracies": 0.6031249761581421, + "rewards/chosen": -0.1776856780052185, + "rewards/margins": 0.07181225717067719, + "rewards/rejected": -0.2494979351758957, + "step": 4120 + }, + { + "epoch": 1.2, + "learning_rate": 3.738222792905501e-07, + "logits/chosen": -2.72918963432312, + "logits/rejected": -2.709934949874878, + "logps/chosen": -193.74960327148438, + "logps/rejected": -181.6873779296875, + "loss": 0.9355, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.16433259844779968, + "rewards/margins": 0.09380577504634857, + "rewards/rejected": -0.25813838839530945, + "step": 4130 + }, + { + "epoch": 1.21, + "learning_rate": 3.7308434687638025e-07, + "logits/chosen": -2.7442197799682617, + "logits/rejected": -2.738088369369507, + "logps/chosen": -188.8663330078125, + "logps/rejected": -181.5855712890625, + "loss": 0.905, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.16837748885154724, + "rewards/margins": 0.08360324800014496, + "rewards/rejected": -0.251980721950531, + "step": 4140 + }, + { + "epoch": 1.21, + "learning_rate": 3.723449962235896e-07, + "logits/chosen": -2.7592225074768066, + "logits/rejected": -2.73795747756958, + "logps/chosen": -205.52249145507812, + "logps/rejected": -195.18670654296875, + "loss": 0.9521, + "rewards/accuracies": 0.5531250238418579, + "rewards/chosen": -0.21364426612854004, + "rewards/margins": 0.04619845002889633, + "rewards/rejected": -0.2598426938056946, + "step": 4150 + }, + { + "epoch": 1.21, + "learning_rate": 3.7160423585134146e-07, + "logits/chosen": -2.752201557159424, + "logits/rejected": -2.752375602722168, + "logps/chosen": -194.0412139892578, + "logps/rejected": -177.90927124023438, + "loss": 0.9319, + "rewards/accuracies": 0.528124988079071, + "rewards/chosen": -0.17724746465682983, + "rewards/margins": 0.04986109584569931, + "rewards/rejected": -0.22710859775543213, + "step": 4160 + }, + { + "epoch": 1.22, + "learning_rate": 3.708620742950426e-07, + "logits/chosen": -2.7732951641082764, + "logits/rejected": -2.7519004344940186, + "logps/chosen": -212.4149627685547, + "logps/rejected": -189.8248748779297, + "loss": 0.9311, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.17742520570755005, + "rewards/margins": 0.06528773903846741, + "rewards/rejected": -0.24271297454833984, + "step": 4170 + }, + { + "epoch": 1.22, + "learning_rate": 3.70118520106245e-07, + "logits/chosen": -2.7384650707244873, + "logits/rejected": -2.7295033931732178, + "logps/chosen": -188.52340698242188, + "logps/rejected": -189.1070098876953, + "loss": 0.9338, + "rewards/accuracies": 0.559374988079071, + "rewards/chosen": -0.16893449425697327, + "rewards/margins": 0.0758652538061142, + "rewards/rejected": -0.24479976296424866, + "step": 4180 + }, + { + "epoch": 1.22, + "learning_rate": 3.693735818525471e-07, + "logits/chosen": -2.7295570373535156, + "logits/rejected": -2.7191290855407715, + "logps/chosen": -190.3443145751953, + "logps/rejected": -174.7919158935547, + "loss": 0.9479, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.164515420794487, + "rewards/margins": 0.06707581877708435, + "rewards/rejected": -0.23159126937389374, + "step": 4190 + }, + { + "epoch": 1.23, + "learning_rate": 3.686272681174953e-07, + "logits/chosen": -2.746267795562744, + "logits/rejected": -2.7370259761810303, + "logps/chosen": -184.0223388671875, + "logps/rejected": -171.82571411132812, + "loss": 0.9368, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.18105138838291168, + "rewards/margins": 0.06937004625797272, + "rewards/rejected": -0.2504214644432068, + "step": 4200 + }, + { + "epoch": 1.23, + "eval_logits/chosen": -2.6547842025756836, + "eval_logits/rejected": -2.6495087146759033, + "eval_logps/chosen": -197.39144897460938, + "eval_logps/rejected": -183.40631103515625, + "eval_loss": 0.9386326670646667, + "eval_rewards/accuracies": 0.5832766890525818, + "eval_rewards/chosen": -0.17861297726631165, + "eval_rewards/margins": 0.06988853961229324, + "eval_rewards/rejected": -0.2485015094280243, + "eval_runtime": 443.4997, + "eval_samples_per_second": 26.528, + "eval_steps_per_second": 3.317, + "step": 4200 + }, + { + "epoch": 1.23, + "learning_rate": 3.6787958750048505e-07, + "logits/chosen": -2.752012252807617, + "logits/rejected": -2.7526049613952637, + "logps/chosen": -192.90499877929688, + "logps/rejected": -189.8139190673828, + "loss": 0.9222, + "rewards/accuracies": 0.596875011920929, + "rewards/chosen": -0.1890353262424469, + "rewards/margins": 0.0668584331870079, + "rewards/rejected": -0.2558937668800354, + "step": 4210 + }, + { + "epoch": 1.23, + "learning_rate": 3.671305486166615e-07, + "logits/chosen": -2.7432100772857666, + "logits/rejected": -2.7489893436431885, + "logps/chosen": -195.6559295654297, + "logps/rejected": -184.36268615722656, + "loss": 0.9649, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -0.22122982144355774, + "rewards/margins": 0.031619004905223846, + "rewards/rejected": -0.25284886360168457, + "step": 4220 + }, + { + "epoch": 1.23, + "learning_rate": 3.663801600968206e-07, + "logits/chosen": -2.720708131790161, + "logits/rejected": -2.7209668159484863, + "logps/chosen": -196.4862518310547, + "logps/rejected": -171.839111328125, + "loss": 0.9449, + "rewards/accuracies": 0.546875, + "rewards/chosen": -0.1937265694141388, + "rewards/margins": 0.04329920932650566, + "rewards/rejected": -0.23702581226825714, + "step": 4230 + }, + { + "epoch": 1.24, + "learning_rate": 3.656284305873093e-07, + "logits/chosen": -2.712952136993408, + "logits/rejected": -2.742710590362549, + "logps/chosen": -195.47030639648438, + "logps/rejected": -192.92300415039062, + "loss": 0.9188, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.1770908236503601, + "rewards/margins": 0.08894769847393036, + "rewards/rejected": -0.2660385072231293, + "step": 4240 + }, + { + "epoch": 1.24, + "learning_rate": 3.6487536874992634e-07, + "logits/chosen": -2.7471885681152344, + "logits/rejected": -2.7262566089630127, + "logps/chosen": -197.21139526367188, + "logps/rejected": -179.6346893310547, + "loss": 0.9263, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.1807824820280075, + "rewards/margins": 0.06291843205690384, + "rewards/rejected": -0.24370090663433075, + "step": 4250 + }, + { + "epoch": 1.24, + "learning_rate": 3.6412098326182193e-07, + "logits/chosen": -2.7384915351867676, + "logits/rejected": -2.759437084197998, + "logps/chosen": -208.9658203125, + "logps/rejected": -195.96910095214844, + "loss": 0.9389, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.19169051945209503, + "rewards/margins": 0.06298931688070297, + "rewards/rejected": -0.2546798288822174, + "step": 4260 + }, + { + "epoch": 1.25, + "learning_rate": 3.633652828153982e-07, + "logits/chosen": -2.758999824523926, + "logits/rejected": -2.7437479496002197, + "logps/chosen": -196.12710571289062, + "logps/rejected": -175.85052490234375, + "loss": 0.9342, + "rewards/accuracies": 0.5531250238418579, + "rewards/chosen": -0.20808005332946777, + "rewards/margins": 0.05318453162908554, + "rewards/rejected": -0.2612645626068115, + "step": 4270 + }, + { + "epoch": 1.25, + "learning_rate": 3.626082761182089e-07, + "logits/chosen": -2.738142490386963, + "logits/rejected": -2.7285945415496826, + "logps/chosen": -195.19175720214844, + "logps/rejected": -186.00698852539062, + "loss": 0.9236, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.16431716084480286, + "rewards/margins": 0.0769704058766365, + "rewards/rejected": -0.24128755927085876, + "step": 4280 + }, + { + "epoch": 1.25, + "learning_rate": 3.6184997189285883e-07, + "logits/chosen": -2.7633702754974365, + "logits/rejected": -2.77644419670105, + "logps/chosen": -221.3899688720703, + "logps/rejected": -203.3837432861328, + "loss": 0.9114, + "rewards/accuracies": 0.621874988079071, + "rewards/chosen": -0.17412501573562622, + "rewards/margins": 0.11658620834350586, + "rewards/rejected": -0.2907112240791321, + "step": 4290 + }, + { + "epoch": 1.25, + "learning_rate": 3.610903788769039e-07, + "logits/chosen": -2.723555088043213, + "logits/rejected": -2.7423713207244873, + "logps/chosen": -177.48287963867188, + "logps/rejected": -188.9573974609375, + "loss": 0.916, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.17435404658317566, + "rewards/margins": 0.1128971129655838, + "rewards/rejected": -0.28725117444992065, + "step": 4300 + }, + { + "epoch": 1.25, + "eval_logits/chosen": -2.6445152759552, + "eval_logits/rejected": -2.6390397548675537, + "eval_logps/chosen": -197.41690063476562, + "eval_logps/rejected": -183.43453979492188, + "eval_loss": 0.9384915828704834, + "eval_rewards/accuracies": 0.5763086080551147, + "eval_rewards/chosen": -0.18115966022014618, + "eval_rewards/margins": 0.07016538083553314, + "eval_rewards/rejected": -0.2513250708580017, + "eval_runtime": 443.5422, + "eval_samples_per_second": 26.525, + "eval_steps_per_second": 3.316, + "step": 4300 + }, + { + "epoch": 1.26, + "learning_rate": 3.603295058227498e-07, + "logits/chosen": -2.7326014041900635, + "logits/rejected": -2.713841438293457, + "logps/chosen": -198.6339874267578, + "logps/rejected": -174.13699340820312, + "loss": 0.935, + "rewards/accuracies": 0.5531250238418579, + "rewards/chosen": -0.19719673693180084, + "rewards/margins": 0.0652797743678093, + "rewards/rejected": -0.26247650384902954, + "step": 4310 + }, + { + "epoch": 1.26, + "learning_rate": 3.5956736149755165e-07, + "logits/chosen": -2.677794933319092, + "logits/rejected": -2.6976592540740967, + "logps/chosen": -189.23277282714844, + "logps/rejected": -194.13796997070312, + "loss": 0.9088, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.1718340814113617, + "rewards/margins": 0.0862637609243393, + "rewards/rejected": -0.2580978274345398, + "step": 4320 + }, + { + "epoch": 1.26, + "learning_rate": 3.588039546831125e-07, + "logits/chosen": -2.7398147583007812, + "logits/rejected": -2.780071258544922, + "logps/chosen": -204.4764404296875, + "logps/rejected": -211.4398956298828, + "loss": 0.9095, + "rewards/accuracies": 0.621874988079071, + "rewards/chosen": -0.19177083671092987, + "rewards/margins": 0.09926985204219818, + "rewards/rejected": -0.29104071855545044, + "step": 4330 + }, + { + "epoch": 1.27, + "learning_rate": 3.580392941757828e-07, + "logits/chosen": -2.7215991020202637, + "logits/rejected": -2.7327351570129395, + "logps/chosen": -199.95437622070312, + "logps/rejected": -191.2192840576172, + "loss": 0.9253, + "rewards/accuracies": 0.621874988079071, + "rewards/chosen": -0.17956005036830902, + "rewards/margins": 0.09569841623306274, + "rewards/rejected": -0.27525845170021057, + "step": 4340 + }, + { + "epoch": 1.27, + "learning_rate": 3.5727338878635837e-07, + "logits/chosen": -2.71069598197937, + "logits/rejected": -2.736706018447876, + "logps/chosen": -192.46652221679688, + "logps/rejected": -198.8728790283203, + "loss": 0.925, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.17255479097366333, + "rewards/margins": 0.05931825190782547, + "rewards/rejected": -0.2318730354309082, + "step": 4350 + }, + { + "epoch": 1.27, + "learning_rate": 3.5650624733997944e-07, + "logits/chosen": -2.746736526489258, + "logits/rejected": -2.7394838333129883, + "logps/chosen": -189.6582794189453, + "logps/rejected": -163.8761749267578, + "loss": 0.9395, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.16428257524967194, + "rewards/margins": 0.0878874734044075, + "rewards/rejected": -0.25217002630233765, + "step": 4360 + }, + { + "epoch": 1.27, + "learning_rate": 3.5573787867602834e-07, + "logits/chosen": -2.731800079345703, + "logits/rejected": -2.7399964332580566, + "logps/chosen": -184.02911376953125, + "logps/rejected": -175.05958557128906, + "loss": 0.9325, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.21719913184642792, + "rewards/margins": 0.06476739048957825, + "rewards/rejected": -0.28196650743484497, + "step": 4370 + }, + { + "epoch": 1.28, + "learning_rate": 3.5496829164802844e-07, + "logits/chosen": -2.7151689529418945, + "logits/rejected": -2.7390894889831543, + "logps/chosen": -179.9576873779297, + "logps/rejected": -177.39881896972656, + "loss": 0.9121, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.18142959475517273, + "rewards/margins": 0.09553618729114532, + "rewards/rejected": -0.27696579694747925, + "step": 4380 + }, + { + "epoch": 1.28, + "learning_rate": 3.5419749512354134e-07, + "logits/chosen": -2.712517738342285, + "logits/rejected": -2.7266857624053955, + "logps/chosen": -174.7775421142578, + "logps/rejected": -166.7398223876953, + "loss": 0.9256, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.18136021494865417, + "rewards/margins": 0.06892851740121841, + "rewards/rejected": -0.2502886950969696, + "step": 4390 + }, + { + "epoch": 1.28, + "learning_rate": 3.534254979840653e-07, + "logits/chosen": -2.7191343307495117, + "logits/rejected": -2.7256722450256348, + "logps/chosen": -185.9070587158203, + "logps/rejected": -176.5060272216797, + "loss": 0.9093, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.18107880651950836, + "rewards/margins": 0.07841619849205017, + "rewards/rejected": -0.25949499011039734, + "step": 4400 + }, + { + "epoch": 1.28, + "eval_logits/chosen": -2.6502106189727783, + "eval_logits/rejected": -2.6448378562927246, + "eval_logps/chosen": -197.46881103515625, + "eval_logps/rejected": -183.49722290039062, + "eval_loss": 0.9375219941139221, + "eval_rewards/accuracies": 0.5831067562103271, + "eval_rewards/chosen": -0.1863512098789215, + "eval_rewards/margins": 0.07124443352222443, + "eval_rewards/rejected": -0.25759562849998474, + "eval_runtime": 443.5197, + "eval_samples_per_second": 26.526, + "eval_steps_per_second": 3.317, + "step": 4400 + }, + { + "epoch": 1.29, + "learning_rate": 3.526523091249324e-07, + "logits/chosen": -2.7146754264831543, + "logits/rejected": -2.723759889602661, + "logps/chosen": -168.30792236328125, + "logps/rejected": -174.40383911132812, + "loss": 0.9294, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.1705663502216339, + "rewards/margins": 0.08236994594335556, + "rewards/rejected": -0.25293630361557007, + "step": 4410 + }, + { + "epoch": 1.29, + "learning_rate": 3.518779374552066e-07, + "logits/chosen": -2.7407498359680176, + "logits/rejected": -2.737842082977295, + "logps/chosen": -195.39694213867188, + "logps/rejected": -192.8684539794922, + "loss": 0.9102, + "rewards/accuracies": 0.621874988079071, + "rewards/chosen": -0.2054818570613861, + "rewards/margins": 0.10714855045080185, + "rewards/rejected": -0.31263041496276855, + "step": 4420 + }, + { + "epoch": 1.29, + "learning_rate": 3.511023918975806e-07, + "logits/chosen": -2.738293409347534, + "logits/rejected": -2.719567060470581, + "logps/chosen": -200.95350646972656, + "logps/rejected": -166.63438415527344, + "loss": 0.9335, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.1738850325345993, + "rewards/margins": 0.07271697372198105, + "rewards/rejected": -0.24660198390483856, + "step": 4430 + }, + { + "epoch": 1.3, + "learning_rate": 3.5032568138827317e-07, + "logits/chosen": -2.703068256378174, + "logits/rejected": -2.6935973167419434, + "logps/chosen": -173.39498901367188, + "logps/rejected": -169.9501190185547, + "loss": 0.9397, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.17545907199382782, + "rewards/margins": 0.06307219713926315, + "rewards/rejected": -0.23853127658367157, + "step": 4440 + }, + { + "epoch": 1.3, + "learning_rate": 3.4954781487692645e-07, + "logits/chosen": -2.705918550491333, + "logits/rejected": -2.718723773956299, + "logps/chosen": -175.64553833007812, + "logps/rejected": -183.96029663085938, + "loss": 0.9333, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.18060100078582764, + "rewards/margins": 0.08112530410289764, + "rewards/rejected": -0.2617262899875641, + "step": 4450 + }, + { + "epoch": 1.3, + "learning_rate": 3.487688013265024e-07, + "logits/chosen": -2.717538356781006, + "logits/rejected": -2.711728572845459, + "logps/chosen": -190.5551300048828, + "logps/rejected": -174.3909149169922, + "loss": 0.9367, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.17379295825958252, + "rewards/margins": 0.06317581981420517, + "rewards/rejected": -0.2369687855243683, + "step": 4460 + }, + { + "epoch": 1.3, + "learning_rate": 3.479886497131799e-07, + "logits/chosen": -2.7508199214935303, + "logits/rejected": -2.751966953277588, + "logps/chosen": -189.25047302246094, + "logps/rejected": -190.21163940429688, + "loss": 0.9321, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.20376893877983093, + "rewards/margins": 0.07644443213939667, + "rewards/rejected": -0.2802133560180664, + "step": 4470 + }, + { + "epoch": 1.31, + "learning_rate": 3.472073690262509e-07, + "logits/chosen": -2.7412967681884766, + "logits/rejected": -2.747689723968506, + "logps/chosen": -190.6803741455078, + "logps/rejected": -186.3988494873047, + "loss": 0.9372, + "rewards/accuracies": 0.559374988079071, + "rewards/chosen": -0.20845802128314972, + "rewards/margins": 0.05214614421129227, + "rewards/rejected": -0.2606041729450226, + "step": 4480 + }, + { + "epoch": 1.31, + "learning_rate": 3.464249682680174e-07, + "logits/chosen": -2.7326245307922363, + "logits/rejected": -2.7284915447235107, + "logps/chosen": -201.30905151367188, + "logps/rejected": -185.57102966308594, + "loss": 0.9209, + "rewards/accuracies": 0.596875011920929, + "rewards/chosen": -0.17261795699596405, + "rewards/margins": 0.08153346180915833, + "rewards/rejected": -0.2541514039039612, + "step": 4490 + }, + { + "epoch": 1.31, + "learning_rate": 3.4564145645368726e-07, + "logits/chosen": -2.745229721069336, + "logits/rejected": -2.7422919273376465, + "logps/chosen": -202.03028869628906, + "logps/rejected": -189.1793212890625, + "loss": 0.9408, + "rewards/accuracies": 0.528124988079071, + "rewards/chosen": -0.2132134884595871, + "rewards/margins": 0.06484152376651764, + "rewards/rejected": -0.27805501222610474, + "step": 4500 + }, + { + "epoch": 1.31, + "eval_logits/chosen": -2.6475512981414795, + "eval_logits/rejected": -2.6421573162078857, + "eval_logps/chosen": -197.50157165527344, + "eval_logps/rejected": -183.53640747070312, + "eval_loss": 0.9368076324462891, + "eval_rewards/accuracies": 0.579707682132721, + "eval_rewards/chosen": -0.18962688744068146, + "eval_rewards/margins": 0.0718846246600151, + "eval_rewards/rejected": -0.26151153445243835, + "eval_runtime": 443.5492, + "eval_samples_per_second": 26.525, + "eval_steps_per_second": 3.316, + "step": 4500 + }, + { + "epoch": 1.32, + "learning_rate": 3.448568426112703e-07, + "logits/chosen": -2.7658443450927734, + "logits/rejected": -2.7624783515930176, + "logps/chosen": -219.56533813476562, + "logps/rejected": -200.84490966796875, + "loss": 0.9374, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.20336861908435822, + "rewards/margins": 0.058419667184352875, + "rewards/rejected": -0.2617882788181305, + "step": 4510 + }, + { + "epoch": 1.32, + "learning_rate": 3.4407113578147484e-07, + "logits/chosen": -2.710071325302124, + "logits/rejected": -2.7460498809814453, + "logps/chosen": -192.58006286621094, + "logps/rejected": -201.64654541015625, + "loss": 0.9309, + "rewards/accuracies": 0.590624988079071, + "rewards/chosen": -0.18946990370750427, + "rewards/margins": 0.07266337424516678, + "rewards/rejected": -0.26213327050209045, + "step": 4520 + }, + { + "epoch": 1.32, + "learning_rate": 3.4328434501760285e-07, + "logits/chosen": -2.7125916481018066, + "logits/rejected": -2.717337131500244, + "logps/chosen": -200.4038848876953, + "logps/rejected": -182.41598510742188, + "loss": 0.9249, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.21740098297595978, + "rewards/margins": 0.08536939322948456, + "rewards/rejected": -0.30277037620544434, + "step": 4530 + }, + { + "epoch": 1.32, + "learning_rate": 3.4249647938544604e-07, + "logits/chosen": -2.742849349975586, + "logits/rejected": -2.7103047370910645, + "logps/chosen": -201.5478057861328, + "logps/rejected": -189.70504760742188, + "loss": 0.9279, + "rewards/accuracies": 0.5406249761581421, + "rewards/chosen": -0.1888127475976944, + "rewards/margins": 0.0719815269112587, + "rewards/rejected": -0.2607942521572113, + "step": 4540 + }, + { + "epoch": 1.33, + "learning_rate": 3.417075479631812e-07, + "logits/chosen": -2.7691056728363037, + "logits/rejected": -2.7712225914001465, + "logps/chosen": -187.98654174804688, + "logps/rejected": -191.6451873779297, + "loss": 0.9276, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.2066299170255661, + "rewards/margins": 0.08311845362186432, + "rewards/rejected": -0.2897483706474304, + "step": 4550 + }, + { + "epoch": 1.33, + "learning_rate": 3.409175598412658e-07, + "logits/chosen": -2.722172737121582, + "logits/rejected": -2.7517924308776855, + "logps/chosen": -192.55972290039062, + "logps/rejected": -193.70285034179688, + "loss": 0.9166, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.19656968116760254, + "rewards/margins": 0.08301197737455368, + "rewards/rejected": -0.27958163619041443, + "step": 4560 + }, + { + "epoch": 1.33, + "learning_rate": 3.40126524122333e-07, + "logits/chosen": -2.7423648834228516, + "logits/rejected": -2.7352731227874756, + "logps/chosen": -193.5330352783203, + "logps/rejected": -192.60610961914062, + "loss": 0.9136, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.18352532386779785, + "rewards/margins": 0.08987872302532196, + "rewards/rejected": -0.273404061794281, + "step": 4570 + }, + { + "epoch": 1.34, + "learning_rate": 3.3933444992108703e-07, + "logits/chosen": -2.7353925704956055, + "logits/rejected": -2.701998472213745, + "logps/chosen": -204.79664611816406, + "logps/rejected": -169.64654541015625, + "loss": 0.9283, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.18033994734287262, + "rewards/margins": 0.06377788633108139, + "rewards/rejected": -0.24411781132221222, + "step": 4580 + }, + { + "epoch": 1.34, + "learning_rate": 3.3854134636419783e-07, + "logits/chosen": -2.70586895942688, + "logits/rejected": -2.7365779876708984, + "logps/chosen": -196.3794708251953, + "logps/rejected": -186.25599670410156, + "loss": 0.9169, + "rewards/accuracies": 0.596875011920929, + "rewards/chosen": -0.19063739478588104, + "rewards/margins": 0.07079647481441498, + "rewards/rejected": -0.26143383979797363, + "step": 4590 + }, + { + "epoch": 1.34, + "learning_rate": 3.377472225901963e-07, + "logits/chosen": -2.7350027561187744, + "logits/rejected": -2.7467072010040283, + "logps/chosen": -196.7301788330078, + "logps/rejected": -190.94921875, + "loss": 0.9245, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.2064705789089203, + "rewards/margins": 0.0866440087556839, + "rewards/rejected": -0.293114572763443, + "step": 4600 + }, + { + "epoch": 1.34, + "eval_logits/chosen": -2.661395311355591, + "eval_logits/rejected": -2.656251907348633, + "eval_logps/chosen": -197.5314483642578, + "eval_logps/rejected": -183.58152770996094, + "eval_loss": 0.9362921118736267, + "eval_rewards/accuracies": 0.5786879658699036, + "eval_rewards/chosen": -0.19261135160923004, + "eval_rewards/margins": 0.0734131708741188, + "eval_rewards/rejected": -0.26602452993392944, + "eval_runtime": 443.5217, + "eval_samples_per_second": 26.526, + "eval_steps_per_second": 3.317, + "step": 4600 + }, + { + "epoch": 1.34, + "learning_rate": 3.3695208774936863e-07, + "logits/chosen": -2.7428646087646484, + "logits/rejected": -2.7751758098602295, + "logps/chosen": -176.3280029296875, + "logps/rejected": -185.43893432617188, + "loss": 0.9284, + "rewards/accuracies": 0.628125011920929, + "rewards/chosen": -0.17020182311534882, + "rewards/margins": 0.10501272976398468, + "rewards/rejected": -0.2752145528793335, + "step": 4610 + }, + { + "epoch": 1.35, + "learning_rate": 3.36155951003651e-07, + "logits/chosen": -2.7163808345794678, + "logits/rejected": -2.720247745513916, + "logps/chosen": -197.26736450195312, + "logps/rejected": -181.1373291015625, + "loss": 0.933, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.1962851732969284, + "rewards/margins": 0.06428460776805878, + "rewards/rejected": -0.2605697810649872, + "step": 4620 + }, + { + "epoch": 1.35, + "learning_rate": 3.353588215265243e-07, + "logits/chosen": -2.741788864135742, + "logits/rejected": -2.771073818206787, + "logps/chosen": -203.08348083496094, + "logps/rejected": -204.08729553222656, + "loss": 0.9208, + "rewards/accuracies": 0.640625, + "rewards/chosen": -0.20287474989891052, + "rewards/margins": 0.10334237664937973, + "rewards/rejected": -0.30621713399887085, + "step": 4630 + }, + { + "epoch": 1.35, + "learning_rate": 3.3456070850290773e-07, + "logits/chosen": -2.7344655990600586, + "logits/rejected": -2.74898624420166, + "logps/chosen": -181.22328186035156, + "logps/rejected": -167.53872680664062, + "loss": 0.9273, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.1856803596019745, + "rewards/margins": 0.07511058449745178, + "rewards/rejected": -0.26079094409942627, + "step": 4640 + }, + { + "epoch": 1.36, + "learning_rate": 3.337616211290539e-07, + "logits/chosen": -2.735715389251709, + "logits/rejected": -2.7299177646636963, + "logps/chosen": -193.27822875976562, + "logps/rejected": -179.62449645996094, + "loss": 0.9184, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.16103467345237732, + "rewards/margins": 0.08624567091464996, + "rewards/rejected": -0.2472803294658661, + "step": 4650 + }, + { + "epoch": 1.36, + "learning_rate": 3.329615686124419e-07, + "logits/chosen": -2.7415900230407715, + "logits/rejected": -2.7193005084991455, + "logps/chosen": -193.71746826171875, + "logps/rejected": -173.92623901367188, + "loss": 0.9443, + "rewards/accuracies": 0.5406249761581421, + "rewards/chosen": -0.23755891621112823, + "rewards/margins": 0.03554988652467728, + "rewards/rejected": -0.2731088101863861, + "step": 4660 + }, + { + "epoch": 1.36, + "learning_rate": 3.321605601716719e-07, + "logits/chosen": -2.7361927032470703, + "logits/rejected": -2.7571628093719482, + "logps/chosen": -230.0321044921875, + "logps/rejected": -213.3992156982422, + "loss": 0.919, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.20671303570270538, + "rewards/margins": 0.07165684551000595, + "rewards/rejected": -0.2783699035644531, + "step": 4670 + }, + { + "epoch": 1.37, + "learning_rate": 3.313586050363589e-07, + "logits/chosen": -2.7228760719299316, + "logits/rejected": -2.7285807132720947, + "logps/chosen": -178.76773071289062, + "logps/rejected": -175.6080322265625, + "loss": 0.9404, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.18862402439117432, + "rewards/margins": 0.0594087652862072, + "rewards/rejected": -0.248032808303833, + "step": 4680 + }, + { + "epoch": 1.37, + "learning_rate": 3.305557124470256e-07, + "logits/chosen": -2.7296719551086426, + "logits/rejected": -2.703669786453247, + "logps/chosen": -177.8763427734375, + "logps/rejected": -169.86013793945312, + "loss": 0.9394, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.19478978216648102, + "rewards/margins": 0.05060974508523941, + "rewards/rejected": -0.24539950489997864, + "step": 4690 + }, + { + "epoch": 1.37, + "learning_rate": 3.2975189165499705e-07, + "logits/chosen": -2.7460432052612305, + "logits/rejected": -2.742731809616089, + "logps/chosen": -193.16432189941406, + "logps/rejected": -172.71707153320312, + "loss": 0.9469, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.23428598046302795, + "rewards/margins": 0.02087447978556156, + "rewards/rejected": -0.2551604211330414, + "step": 4700 + }, + { + "epoch": 1.37, + "eval_logits/chosen": -2.663195848464966, + "eval_logits/rejected": -2.658090591430664, + "eval_logps/chosen": -197.5492706298828, + "eval_logps/rejected": -183.58750915527344, + "eval_loss": 0.9363651275634766, + "eval_rewards/accuracies": 0.5774983167648315, + "eval_rewards/chosen": -0.19439838826656342, + "eval_rewards/margins": 0.0722242221236229, + "eval_rewards/rejected": -0.2666226327419281, + "eval_runtime": 443.5861, + "eval_samples_per_second": 26.522, + "eval_steps_per_second": 3.316, + "step": 4700 + }, + { + "epoch": 1.37, + "learning_rate": 3.2894715192229334e-07, + "logits/chosen": -2.7303051948547363, + "logits/rejected": -2.715846538543701, + "logps/chosen": -185.22119140625, + "logps/rejected": -162.5220947265625, + "loss": 0.9338, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.19038231670856476, + "rewards/margins": 0.06438779830932617, + "rewards/rejected": -0.25477010011672974, + "step": 4710 + }, + { + "epoch": 1.38, + "learning_rate": 3.2814150252152297e-07, + "logits/chosen": -2.7478604316711426, + "logits/rejected": -2.7180073261260986, + "logps/chosen": -212.9084014892578, + "logps/rejected": -183.78271484375, + "loss": 0.9501, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.22500388324260712, + "rewards/margins": 0.048243068158626556, + "rewards/rejected": -0.2732469439506531, + "step": 4720 + }, + { + "epoch": 1.38, + "learning_rate": 3.273349527357761e-07, + "logits/chosen": -2.721357822418213, + "logits/rejected": -2.749497652053833, + "logps/chosen": -183.78855895996094, + "logps/rejected": -177.4352569580078, + "loss": 0.9265, + "rewards/accuracies": 0.590624988079071, + "rewards/chosen": -0.19720439612865448, + "rewards/margins": 0.0935719683766365, + "rewards/rejected": -0.2907763719558716, + "step": 4730 + }, + { + "epoch": 1.38, + "learning_rate": 3.265275118585178e-07, + "logits/chosen": -2.7387373447418213, + "logits/rejected": -2.7455060482025146, + "logps/chosen": -188.77359008789062, + "logps/rejected": -185.95318603515625, + "loss": 0.92, + "rewards/accuracies": 0.596875011920929, + "rewards/chosen": -0.21058376133441925, + "rewards/margins": 0.10225590318441391, + "rewards/rejected": -0.31283968687057495, + "step": 4740 + }, + { + "epoch": 1.39, + "learning_rate": 3.257191891934804e-07, + "logits/chosen": -2.7524802684783936, + "logits/rejected": -2.738579034805298, + "logps/chosen": -195.68280029296875, + "logps/rejected": -172.1834716796875, + "loss": 0.9377, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.20516614615917206, + "rewards/margins": 0.06340218335390091, + "rewards/rejected": -0.26856836676597595, + "step": 4750 + }, + { + "epoch": 1.39, + "learning_rate": 3.2490999405455675e-07, + "logits/chosen": -2.7448105812072754, + "logits/rejected": -2.7162296772003174, + "logps/chosen": -224.0458526611328, + "logps/rejected": -184.15225219726562, + "loss": 0.9251, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.18894138932228088, + "rewards/margins": 0.0902334600687027, + "rewards/rejected": -0.2791748642921448, + "step": 4760 + }, + { + "epoch": 1.39, + "learning_rate": 3.24099935765693e-07, + "logits/chosen": -2.7416248321533203, + "logits/rejected": -2.7578914165496826, + "logps/chosen": -175.80899047851562, + "logps/rejected": -186.74563598632812, + "loss": 0.9348, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.2133389413356781, + "rewards/margins": 0.0710340365767479, + "rewards/rejected": -0.2843729853630066, + "step": 4770 + }, + { + "epoch": 1.39, + "learning_rate": 3.2328902366078055e-07, + "logits/chosen": -2.7463059425354004, + "logits/rejected": -2.7555289268493652, + "logps/chosen": -216.08584594726562, + "logps/rejected": -196.11654663085938, + "loss": 0.9453, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.19555380940437317, + "rewards/margins": 0.0949058011174202, + "rewards/rejected": -0.29045960307121277, + "step": 4780 + }, + { + "epoch": 1.4, + "learning_rate": 3.2247726708354916e-07, + "logits/chosen": -2.7026150226593018, + "logits/rejected": -2.727374315261841, + "logps/chosen": -188.6559295654297, + "logps/rejected": -184.67588806152344, + "loss": 0.9123, + "rewards/accuracies": 0.653124988079071, + "rewards/chosen": -0.17864663898944855, + "rewards/margins": 0.08552898466587067, + "rewards/rejected": -0.2641756236553192, + "step": 4790 + }, + { + "epoch": 1.4, + "learning_rate": 3.216646753874589e-07, + "logits/chosen": -2.7416110038757324, + "logits/rejected": -2.7252979278564453, + "logps/chosen": -200.88369750976562, + "logps/rejected": -183.188232421875, + "loss": 0.9421, + "rewards/accuracies": 0.590624988079071, + "rewards/chosen": -0.21473363041877747, + "rewards/margins": 0.048976875841617584, + "rewards/rejected": -0.26371049880981445, + "step": 4800 + }, + { + "epoch": 1.4, + "eval_logits/chosen": -2.669057607650757, + "eval_logits/rejected": -2.6640470027923584, + "eval_logps/chosen": -197.55172729492188, + "eval_logps/rejected": -183.60397338867188, + "eval_loss": 0.9358345866203308, + "eval_rewards/accuracies": 0.5819170475006104, + "eval_rewards/chosen": -0.19464226067066193, + "eval_rewards/margins": 0.07362484186887741, + "eval_rewards/rejected": -0.26826706528663635, + "eval_runtime": 443.5515, + "eval_samples_per_second": 26.525, + "eval_steps_per_second": 3.316, + "step": 4800 + }, + { + "epoch": 1.4, + "learning_rate": 3.208512579355925e-07, + "logits/chosen": -2.757784366607666, + "logits/rejected": -2.7481346130371094, + "logps/chosen": -201.49862670898438, + "logps/rejected": -203.2624969482422, + "loss": 0.9325, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.19091400504112244, + "rewards/margins": 0.06763036549091339, + "rewards/rejected": -0.258544385433197, + "step": 4810 + }, + { + "epoch": 1.41, + "learning_rate": 3.200370241005476e-07, + "logits/chosen": -2.7480101585388184, + "logits/rejected": -2.723578929901123, + "logps/chosen": -202.5946502685547, + "logps/rejected": -169.34739685058594, + "loss": 0.9091, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.18458306789398193, + "rewards/margins": 0.07845629006624222, + "rewards/rejected": -0.26303935050964355, + "step": 4820 + }, + { + "epoch": 1.41, + "learning_rate": 3.1922198326432835e-07, + "logits/chosen": -2.7490594387054443, + "logits/rejected": -2.738849639892578, + "logps/chosen": -187.2062530517578, + "logps/rejected": -175.1187286376953, + "loss": 0.9332, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.19218167662620544, + "rewards/margins": 0.08368309587240219, + "rewards/rejected": -0.27586477994918823, + "step": 4830 + }, + { + "epoch": 1.41, + "learning_rate": 3.184061448182379e-07, + "logits/chosen": -2.7155652046203613, + "logits/rejected": -2.6949169635772705, + "logps/chosen": -202.61073303222656, + "logps/rejected": -180.4763946533203, + "loss": 0.9499, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -0.19347918033599854, + "rewards/margins": 0.04405388981103897, + "rewards/rejected": -0.2375330626964569, + "step": 4840 + }, + { + "epoch": 1.41, + "learning_rate": 3.175895181627695e-07, + "logits/chosen": -2.7712833881378174, + "logits/rejected": -2.7551021575927734, + "logps/chosen": -192.11685180664062, + "logps/rejected": -185.52256774902344, + "loss": 0.9448, + "rewards/accuracies": 0.559374988079071, + "rewards/chosen": -0.2322724163532257, + "rewards/margins": 0.05692540481686592, + "rewards/rejected": -0.28919777274131775, + "step": 4850 + }, + { + "epoch": 1.42, + "learning_rate": 3.167721127074988e-07, + "logits/chosen": -2.754729986190796, + "logits/rejected": -2.7834737300872803, + "logps/chosen": -183.2649383544922, + "logps/rejected": -194.07073974609375, + "loss": 0.9309, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.2212039977312088, + "rewards/margins": 0.05648614838719368, + "rewards/rejected": -0.2776901423931122, + "step": 4860 + }, + { + "epoch": 1.42, + "learning_rate": 3.15953937870975e-07, + "logits/chosen": -2.7316761016845703, + "logits/rejected": -2.7409138679504395, + "logps/chosen": -185.97994995117188, + "logps/rejected": -187.85910034179688, + "loss": 0.9352, + "rewards/accuracies": 0.559374988079071, + "rewards/chosen": -0.19891253113746643, + "rewards/margins": 0.0585172101855278, + "rewards/rejected": -0.25742974877357483, + "step": 4870 + }, + { + "epoch": 1.42, + "learning_rate": 3.1513500308061264e-07, + "logits/chosen": -2.748883008956909, + "logits/rejected": -2.7345004081726074, + "logps/chosen": -204.35073852539062, + "logps/rejected": -189.57794189453125, + "loss": 0.9219, + "rewards/accuracies": 0.5718749761581421, + "rewards/chosen": -0.19797687232494354, + "rewards/margins": 0.07122127711772919, + "rewards/rejected": -0.26919814944267273, + "step": 4880 + }, + { + "epoch": 1.43, + "learning_rate": 3.1431531777258265e-07, + "logits/chosen": -2.7522618770599365, + "logits/rejected": -2.7130308151245117, + "logps/chosen": -183.6536102294922, + "logps/rejected": -157.83010864257812, + "loss": 0.9459, + "rewards/accuracies": 0.5531250238418579, + "rewards/chosen": -0.19545993208885193, + "rewards/margins": 0.053138844668865204, + "rewards/rejected": -0.24859876930713654, + "step": 4890 + }, + { + "epoch": 1.43, + "learning_rate": 3.134948913917039e-07, + "logits/chosen": -2.717576026916504, + "logits/rejected": -2.730308771133423, + "logps/chosen": -185.91043090820312, + "logps/rejected": -183.8404998779297, + "loss": 0.9076, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.1804662048816681, + "rewards/margins": 0.09477277100086212, + "rewards/rejected": -0.2752389907836914, + "step": 4900 + }, + { + "epoch": 1.43, + "eval_logits/chosen": -2.667618751525879, + "eval_logits/rejected": -2.662586212158203, + "eval_logps/chosen": -197.56797790527344, + "eval_logps/rejected": -183.62533569335938, + "eval_loss": 0.9355936646461487, + "eval_rewards/accuracies": 0.5798776149749756, + "eval_rewards/chosen": -0.19626484811306, + "eval_rewards/margins": 0.07414159178733826, + "eval_rewards/rejected": -0.27040642499923706, + "eval_runtime": 443.5025, + "eval_samples_per_second": 26.527, + "eval_steps_per_second": 3.317, + "step": 4900 + }, + { + "epoch": 1.43, + "learning_rate": 3.126737333913344e-07, + "logits/chosen": -2.769333600997925, + "logits/rejected": -2.768833637237549, + "logps/chosen": -220.874755859375, + "logps/rejected": -203.6051483154297, + "loss": 0.9021, + "rewards/accuracies": 0.6156250238418579, + "rewards/chosen": -0.1960815042257309, + "rewards/margins": 0.13010276854038239, + "rewards/rejected": -0.32618430256843567, + "step": 4910 + }, + { + "epoch": 1.44, + "learning_rate": 3.1185185323326194e-07, + "logits/chosen": -2.7267374992370605, + "logits/rejected": -2.768446445465088, + "logps/chosen": -182.239990234375, + "logps/rejected": -190.83349609375, + "loss": 0.906, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.19862093031406403, + "rewards/margins": 0.08212271332740784, + "rewards/rejected": -0.28074365854263306, + "step": 4920 + }, + { + "epoch": 1.44, + "learning_rate": 3.110292603875956e-07, + "logits/chosen": -2.7466022968292236, + "logits/rejected": -2.7096469402313232, + "logps/chosen": -208.9374542236328, + "logps/rejected": -182.35670471191406, + "loss": 0.948, + "rewards/accuracies": 0.5718749761581421, + "rewards/chosen": -0.2167479544878006, + "rewards/margins": 0.04567436873912811, + "rewards/rejected": -0.2624223232269287, + "step": 4930 + }, + { + "epoch": 1.44, + "learning_rate": 3.1020596433265635e-07, + "logits/chosen": -2.7337393760681152, + "logits/rejected": -2.732323408126831, + "logps/chosen": -184.12216186523438, + "logps/rejected": -185.9611053466797, + "loss": 0.9425, + "rewards/accuracies": 0.5718749761581421, + "rewards/chosen": -0.2332111895084381, + "rewards/margins": 0.05965740606188774, + "rewards/rejected": -0.29286855459213257, + "step": 4940 + }, + { + "epoch": 1.44, + "learning_rate": 3.0938197455486783e-07, + "logits/chosen": -2.6897988319396973, + "logits/rejected": -2.7108638286590576, + "logps/chosen": -186.6949005126953, + "logps/rejected": -181.40200805664062, + "loss": 0.9367, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.2100130021572113, + "rewards/margins": 0.0862884670495987, + "rewards/rejected": -0.2963015139102936, + "step": 4950 + }, + { + "epoch": 1.45, + "learning_rate": 3.08557300548647e-07, + "logits/chosen": -2.7417614459991455, + "logits/rejected": -2.7184202671051025, + "logps/chosen": -192.87637329101562, + "logps/rejected": -162.53128051757812, + "loss": 0.9197, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.19202032685279846, + "rewards/margins": 0.07406075298786163, + "rewards/rejected": -0.2660810351371765, + "step": 4960 + }, + { + "epoch": 1.45, + "learning_rate": 3.077319518162952e-07, + "logits/chosen": -2.730668544769287, + "logits/rejected": -2.739375352859497, + "logps/chosen": -196.22915649414062, + "logps/rejected": -202.01319885253906, + "loss": 0.9224, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.21786180138587952, + "rewards/margins": 0.07545635849237442, + "rewards/rejected": -0.29331815242767334, + "step": 4970 + }, + { + "epoch": 1.45, + "learning_rate": 3.069059378678878e-07, + "logits/chosen": -2.732133150100708, + "logits/rejected": -2.7478561401367188, + "logps/chosen": -178.5888671875, + "logps/rejected": -178.92031860351562, + "loss": 0.9194, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.18372566998004913, + "rewards/margins": 0.10189918428659439, + "rewards/rejected": -0.28562483191490173, + "step": 4980 + }, + { + "epoch": 1.46, + "learning_rate": 3.0607926822116564e-07, + "logits/chosen": -2.73771333694458, + "logits/rejected": -2.738654851913452, + "logps/chosen": -178.12435913085938, + "logps/rejected": -174.51303100585938, + "loss": 0.9264, + "rewards/accuracies": 0.596875011920929, + "rewards/chosen": -0.19615371525287628, + "rewards/margins": 0.07561203837394714, + "rewards/rejected": -0.27176573872566223, + "step": 4990 + }, + { + "epoch": 1.46, + "learning_rate": 3.0525195240142437e-07, + "logits/chosen": -2.7430174350738525, + "logits/rejected": -2.755254030227661, + "logps/chosen": -195.0157470703125, + "logps/rejected": -188.9918975830078, + "loss": 0.94, + "rewards/accuracies": 0.621874988079071, + "rewards/chosen": -0.16923925280570984, + "rewards/margins": 0.08428038656711578, + "rewards/rejected": -0.25351962447166443, + "step": 5000 + }, + { + "epoch": 1.46, + "eval_logits/chosen": -2.6492247581481934, + "eval_logits/rejected": -2.643824577331543, + "eval_logps/chosen": -197.6009521484375, + "eval_logps/rejected": -183.6591033935547, + "eval_loss": 0.9352905750274658, + "eval_rewards/accuracies": 0.580047607421875, + "eval_rewards/chosen": -0.19956253468990326, + "eval_rewards/margins": 0.07421758025884628, + "eval_rewards/rejected": -0.27378013730049133, + "eval_runtime": 443.4518, + "eval_samples_per_second": 26.531, + "eval_steps_per_second": 3.317, + "step": 5000 + }, + { + "epoch": 1.46, + "learning_rate": 3.044239999414055e-07, + "logits/chosen": -2.7292063236236572, + "logits/rejected": -2.706313133239746, + "logps/chosen": -184.35232543945312, + "logps/rejected": -179.50625610351562, + "loss": 0.9422, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.20498375594615936, + "rewards/margins": 0.06014307588338852, + "rewards/rejected": -0.2651267945766449, + "step": 5010 + }, + { + "epoch": 1.46, + "learning_rate": 3.0359542038118606e-07, + "logits/chosen": -2.742978572845459, + "logits/rejected": -2.734860420227051, + "logps/chosen": -206.57275390625, + "logps/rejected": -190.71548461914062, + "loss": 0.9089, + "rewards/accuracies": 0.6031249761581421, + "rewards/chosen": -0.19854263961315155, + "rewards/margins": 0.08930404484272003, + "rewards/rejected": -0.2878466844558716, + "step": 5020 + }, + { + "epoch": 1.47, + "learning_rate": 3.027662232680689e-07, + "logits/chosen": -2.72959566116333, + "logits/rejected": -2.729051113128662, + "logps/chosen": -195.5106964111328, + "logps/rejected": -171.5630645751953, + "loss": 0.9356, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.18533293902873993, + "rewards/margins": 0.09665311127901077, + "rewards/rejected": -0.2819860577583313, + "step": 5030 + }, + { + "epoch": 1.47, + "learning_rate": 3.0193641815647255e-07, + "logits/chosen": -2.735405445098877, + "logits/rejected": -2.7377450466156006, + "logps/chosen": -194.8864288330078, + "logps/rejected": -184.33419799804688, + "loss": 0.9143, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.20550887286663055, + "rewards/margins": 0.09748460352420807, + "rewards/rejected": -0.3029934763908386, + "step": 5040 + }, + { + "epoch": 1.47, + "learning_rate": 3.011060146078212e-07, + "logits/chosen": -2.740532636642456, + "logits/rejected": -2.745987892150879, + "logps/chosen": -198.83352661132812, + "logps/rejected": -194.37173461914062, + "loss": 0.9368, + "rewards/accuracies": 0.546875, + "rewards/chosen": -0.22939512133598328, + "rewards/margins": 0.06054091453552246, + "rewards/rejected": -0.28993600606918335, + "step": 5050 + }, + { + "epoch": 1.48, + "learning_rate": 3.002750221904347e-07, + "logits/chosen": -2.7253835201263428, + "logits/rejected": -2.7207252979278564, + "logps/chosen": -188.45120239257812, + "logps/rejected": -186.4692840576172, + "loss": 0.9424, + "rewards/accuracies": 0.559374988079071, + "rewards/chosen": -0.21971821784973145, + "rewards/margins": 0.0432281419634819, + "rewards/rejected": -0.26294639706611633, + "step": 5060 + }, + { + "epoch": 1.48, + "learning_rate": 2.9944345047941785e-07, + "logits/chosen": -2.716625928878784, + "logits/rejected": -2.7393455505371094, + "logps/chosen": -203.8828582763672, + "logps/rejected": -200.91465759277344, + "loss": 0.9127, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.2097250521183014, + "rewards/margins": 0.07229788601398468, + "rewards/rejected": -0.2820229232311249, + "step": 5070 + }, + { + "epoch": 1.48, + "learning_rate": 2.9861130905655065e-07, + "logits/chosen": -2.730515956878662, + "logits/rejected": -2.6795966625213623, + "logps/chosen": -215.6951904296875, + "logps/rejected": -176.55252075195312, + "loss": 0.9354, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.2164064645767212, + "rewards/margins": 0.047541793435811996, + "rewards/rejected": -0.2639482617378235, + "step": 5080 + }, + { + "epoch": 1.48, + "learning_rate": 2.977786075101774e-07, + "logits/chosen": -2.740309953689575, + "logits/rejected": -2.7642016410827637, + "logps/chosen": -186.1707305908203, + "logps/rejected": -197.3705596923828, + "loss": 0.9294, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.23175272345542908, + "rewards/margins": 0.0802375078201294, + "rewards/rejected": -0.31199023127555847, + "step": 5090 + }, + { + "epoch": 1.49, + "learning_rate": 2.9694535543509653e-07, + "logits/chosen": -2.7505338191986084, + "logits/rejected": -2.7634050846099854, + "logps/chosen": -180.63919067382812, + "logps/rejected": -180.73904418945312, + "loss": 0.9288, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.21512751281261444, + "rewards/margins": 0.08128011971712112, + "rewards/rejected": -0.29640763998031616, + "step": 5100 + }, + { + "epoch": 1.49, + "eval_logits/chosen": -2.6486804485321045, + "eval_logits/rejected": -2.6432888507843018, + "eval_logps/chosen": -197.6044921875, + "eval_logps/rejected": -183.66253662109375, + "eval_loss": 0.9350979924201965, + "eval_rewards/accuracies": 0.580897331237793, + "eval_rewards/chosen": -0.19991926848888397, + "eval_rewards/margins": 0.07420553267002106, + "eval_rewards/rejected": -0.27412480115890503, + "eval_runtime": 443.5316, + "eval_samples_per_second": 26.526, + "eval_steps_per_second": 3.317, + "step": 5100 + }, + { + "epoch": 1.49, + "learning_rate": 2.961115624324499e-07, + "logits/chosen": -2.724372148513794, + "logits/rejected": -2.7239785194396973, + "logps/chosen": -184.83824157714844, + "logps/rejected": -166.89259338378906, + "loss": 0.9269, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.2036965787410736, + "rewards/margins": 0.06950495392084122, + "rewards/rejected": -0.27320152521133423, + "step": 5110 + }, + { + "epoch": 1.49, + "learning_rate": 2.9527723810961207e-07, + "logits/chosen": -2.735801935195923, + "logits/rejected": -2.7238357067108154, + "logps/chosen": -196.07200622558594, + "logps/rejected": -173.1053009033203, + "loss": 0.9487, + "rewards/accuracies": 0.528124988079071, + "rewards/chosen": -0.20888221263885498, + "rewards/margins": 0.05614093691110611, + "rewards/rejected": -0.2650231420993805, + "step": 5120 + }, + { + "epoch": 1.5, + "learning_rate": 2.9444239208008e-07, + "logits/chosen": -2.7660791873931885, + "logits/rejected": -2.7649636268615723, + "logps/chosen": -212.7071990966797, + "logps/rejected": -200.44374084472656, + "loss": 0.9248, + "rewards/accuracies": 0.559374988079071, + "rewards/chosen": -0.21736297011375427, + "rewards/margins": 0.06548234075307846, + "rewards/rejected": -0.28284531831741333, + "step": 5130 + }, + { + "epoch": 1.5, + "learning_rate": 2.936070339633618e-07, + "logits/chosen": -2.736414909362793, + "logits/rejected": -2.727729320526123, + "logps/chosen": -184.80453491210938, + "logps/rejected": -174.11581420898438, + "loss": 0.9251, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.18705031275749207, + "rewards/margins": 0.08300456404685974, + "rewards/rejected": -0.2700548768043518, + "step": 5140 + }, + { + "epoch": 1.5, + "learning_rate": 2.9277117338486616e-07, + "logits/chosen": -2.738631248474121, + "logits/rejected": -2.7362170219421387, + "logps/chosen": -184.38180541992188, + "logps/rejected": -176.70655822753906, + "loss": 0.9248, + "rewards/accuracies": 0.5531250238418579, + "rewards/chosen": -0.22087529301643372, + "rewards/margins": 0.07811127603054047, + "rewards/rejected": -0.29898661375045776, + "step": 5150 + }, + { + "epoch": 1.51, + "learning_rate": 2.9193481997579133e-07, + "logits/chosen": -2.7428336143493652, + "logits/rejected": -2.723949670791626, + "logps/chosen": -219.9194793701172, + "logps/rejected": -203.8686981201172, + "loss": 0.9388, + "rewards/accuracies": 0.5218750238418579, + "rewards/chosen": -0.23469701409339905, + "rewards/margins": 0.06632021069526672, + "rewards/rejected": -0.30101722478866577, + "step": 5160 + }, + { + "epoch": 1.51, + "learning_rate": 2.910979833730145e-07, + "logits/chosen": -2.7712066173553467, + "logits/rejected": -2.741729259490967, + "logps/chosen": -203.03404235839844, + "logps/rejected": -181.98988342285156, + "loss": 0.9129, + "rewards/accuracies": 0.559374988079071, + "rewards/chosen": -0.20010265707969666, + "rewards/margins": 0.08742941915988922, + "rewards/rejected": -0.2875320315361023, + "step": 5170 + }, + { + "epoch": 1.51, + "learning_rate": 2.9026067321897995e-07, + "logits/chosen": -2.717153787612915, + "logits/rejected": -2.7007384300231934, + "logps/chosen": -193.92271423339844, + "logps/rejected": -167.20896911621094, + "loss": 0.9185, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.19192959368228912, + "rewards/margins": 0.06471812725067139, + "rewards/rejected": -0.2566477060317993, + "step": 5180 + }, + { + "epoch": 1.51, + "learning_rate": 2.8942289916158883e-07, + "logits/chosen": -2.7420923709869385, + "logits/rejected": -2.760744571685791, + "logps/chosen": -181.37686157226562, + "logps/rejected": -184.68063354492188, + "loss": 0.9454, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.2095378190279007, + "rewards/margins": 0.06707773357629776, + "rewards/rejected": -0.27661553025245667, + "step": 5190 + }, + { + "epoch": 1.52, + "learning_rate": 2.8858467085408763e-07, + "logits/chosen": -2.76371431350708, + "logits/rejected": -2.763125419616699, + "logps/chosen": -198.66419982910156, + "logps/rejected": -195.47776794433594, + "loss": 0.927, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.23758378624916077, + "rewards/margins": 0.06860102713108063, + "rewards/rejected": -0.3061848282814026, + "step": 5200 + }, + { + "epoch": 1.52, + "eval_logits/chosen": -2.6552000045776367, + "eval_logits/rejected": -2.6499037742614746, + "eval_logps/chosen": -197.61444091796875, + "eval_logps/rejected": -183.68826293945312, + "eval_loss": 0.9342753887176514, + "eval_rewards/accuracies": 0.5820870399475098, + "eval_rewards/chosen": -0.2009136825799942, + "eval_rewards/margins": 0.07578551024198532, + "eval_rewards/rejected": -0.2766991853713989, + "eval_runtime": 443.3793, + "eval_samples_per_second": 26.535, + "eval_steps_per_second": 3.318, + "step": 5200 + }, + { + "epoch": 1.52, + "learning_rate": 2.877459979549566e-07, + "logits/chosen": -2.7898855209350586, + "logits/rejected": -2.7748923301696777, + "logps/chosen": -210.88479614257812, + "logps/rejected": -194.19326782226562, + "loss": 0.9279, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.19191429018974304, + "rewards/margins": 0.08744990825653076, + "rewards/rejected": -0.2793641984462738, + "step": 5210 + }, + { + "epoch": 1.52, + "learning_rate": 2.869068901277991e-07, + "logits/chosen": -2.7477736473083496, + "logits/rejected": -2.7583398818969727, + "logps/chosen": -197.46267700195312, + "logps/rejected": -200.65147399902344, + "loss": 0.9151, + "rewards/accuracies": 0.596875011920929, + "rewards/chosen": -0.18644069135189056, + "rewards/margins": 0.09844745695590973, + "rewards/rejected": -0.2848881483078003, + "step": 5220 + }, + { + "epoch": 1.53, + "learning_rate": 2.860673570412297e-07, + "logits/chosen": -2.758688449859619, + "logits/rejected": -2.746370792388916, + "logps/chosen": -198.10745239257812, + "logps/rejected": -179.736328125, + "loss": 0.9259, + "rewards/accuracies": 0.5218750238418579, + "rewards/chosen": -0.2171638011932373, + "rewards/margins": 0.04720696806907654, + "rewards/rejected": -0.26437076926231384, + "step": 5230 + }, + { + "epoch": 1.53, + "learning_rate": 2.852274083687634e-07, + "logits/chosen": -2.754690647125244, + "logits/rejected": -2.7557036876678467, + "logps/chosen": -189.4248809814453, + "logps/rejected": -189.26260375976562, + "loss": 0.9368, + "rewards/accuracies": 0.5406249761581421, + "rewards/chosen": -0.199670672416687, + "rewards/margins": 0.05970006436109543, + "rewards/rejected": -0.25937071442604065, + "step": 5240 + }, + { + "epoch": 1.53, + "learning_rate": 2.8438705378870337e-07, + "logits/chosen": -2.7454700469970703, + "logits/rejected": -2.7141568660736084, + "logps/chosen": -206.284423828125, + "logps/rejected": -179.8996124267578, + "loss": 0.9198, + "rewards/accuracies": 0.621874988079071, + "rewards/chosen": -0.19247445464134216, + "rewards/margins": 0.10617701709270477, + "rewards/rejected": -0.29865145683288574, + "step": 5250 + }, + { + "epoch": 1.53, + "learning_rate": 2.8354630298403015e-07, + "logits/chosen": -2.735435962677002, + "logits/rejected": -2.7617130279541016, + "logps/chosen": -211.7236328125, + "logps/rejected": -202.30496215820312, + "loss": 0.9375, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.23901574313640594, + "rewards/margins": 0.07207988202571869, + "rewards/rejected": -0.31109562516212463, + "step": 5260 + }, + { + "epoch": 1.54, + "learning_rate": 2.827051656422895e-07, + "logits/chosen": -2.7626614570617676, + "logits/rejected": -2.724433422088623, + "logps/chosen": -216.4359893798828, + "logps/rejected": -187.43603515625, + "loss": 0.9277, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.21994581818580627, + "rewards/margins": 0.08216341584920883, + "rewards/rejected": -0.3021092116832733, + "step": 5270 + }, + { + "epoch": 1.54, + "learning_rate": 2.818636514554814e-07, + "logits/chosen": -2.7331411838531494, + "logits/rejected": -2.714127779006958, + "logps/chosen": -202.60743713378906, + "logps/rejected": -178.23214721679688, + "loss": 0.9166, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.20016559958457947, + "rewards/margins": 0.08119861781597137, + "rewards/rejected": -0.28136423230171204, + "step": 5280 + }, + { + "epoch": 1.54, + "learning_rate": 2.810217701199478e-07, + "logits/chosen": -2.7301182746887207, + "logits/rejected": -2.7275664806365967, + "logps/chosen": -196.49765014648438, + "logps/rejected": -185.9195556640625, + "loss": 0.9229, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.19735606014728546, + "rewards/margins": 0.09479983896017075, + "rewards/rejected": -0.292155921459198, + "step": 5290 + }, + { + "epoch": 1.55, + "learning_rate": 2.801795313362609e-07, + "logits/chosen": -2.7671756744384766, + "logits/rejected": -2.7629024982452393, + "logps/chosen": -205.7484893798828, + "logps/rejected": -188.64962768554688, + "loss": 0.9171, + "rewards/accuracies": 0.590624988079071, + "rewards/chosen": -0.20872464776039124, + "rewards/margins": 0.07953401654958725, + "rewards/rejected": -0.2882586419582367, + "step": 5300 + }, + { + "epoch": 1.55, + "eval_logits/chosen": -2.6475534439086914, + "eval_logits/rejected": -2.642137050628662, + "eval_logps/chosen": -197.62921142578125, + "eval_logps/rejected": -183.7055206298828, + "eval_loss": 0.933870255947113, + "eval_rewards/accuracies": 0.5822569727897644, + "eval_rewards/chosen": -0.20238925516605377, + "eval_rewards/margins": 0.0760333314538002, + "eval_rewards/rejected": -0.27842259407043457, + "eval_runtime": 443.3851, + "eval_samples_per_second": 26.534, + "eval_steps_per_second": 3.318, + "step": 5300 + }, + { + "epoch": 1.55, + "learning_rate": 2.7933694480911217e-07, + "logits/chosen": -2.7444474697113037, + "logits/rejected": -2.756112575531006, + "logps/chosen": -191.1089630126953, + "logps/rejected": -194.6819305419922, + "loss": 0.9219, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.20938988029956818, + "rewards/margins": 0.10179195553064346, + "rewards/rejected": -0.31118181347846985, + "step": 5310 + }, + { + "epoch": 1.55, + "learning_rate": 2.7849402024719944e-07, + "logits/chosen": -2.747704267501831, + "logits/rejected": -2.757530689239502, + "logps/chosen": -217.6678466796875, + "logps/rejected": -210.233154296875, + "loss": 0.9241, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.21528121829032898, + "rewards/margins": 0.08019135892391205, + "rewards/rejected": -0.29547256231307983, + "step": 5320 + }, + { + "epoch": 1.55, + "learning_rate": 2.7765076736311575e-07, + "logits/chosen": -2.72330641746521, + "logits/rejected": -2.718510150909424, + "logps/chosen": -178.87374877929688, + "logps/rejected": -171.4022979736328, + "loss": 0.9264, + "rewards/accuracies": 0.590624988079071, + "rewards/chosen": -0.213222935795784, + "rewards/margins": 0.059436433017253876, + "rewards/rejected": -0.2726593613624573, + "step": 5330 + }, + { + "epoch": 1.56, + "learning_rate": 2.7680719587323717e-07, + "logits/chosen": -2.7657723426818848, + "logits/rejected": -2.771605968475342, + "logps/chosen": -208.59555053710938, + "logps/rejected": -198.11004638671875, + "loss": 0.9155, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.20413896441459656, + "rewards/margins": 0.11837039142847061, + "rewards/rejected": -0.3225093483924866, + "step": 5340 + }, + { + "epoch": 1.56, + "learning_rate": 2.759633154976111e-07, + "logits/chosen": -2.7205915451049805, + "logits/rejected": -2.726088285446167, + "logps/chosen": -182.267822265625, + "logps/rejected": -174.0267791748047, + "loss": 0.9174, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.20186123251914978, + "rewards/margins": 0.08812803775072098, + "rewards/rejected": -0.28998929262161255, + "step": 5350 + }, + { + "epoch": 1.56, + "learning_rate": 2.7511913595984374e-07, + "logits/chosen": -2.7115683555603027, + "logits/rejected": -2.707627058029175, + "logps/chosen": -190.1064453125, + "logps/rejected": -178.34567260742188, + "loss": 0.9147, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.18654951453208923, + "rewards/margins": 0.10496105998754501, + "rewards/rejected": -0.29151061177253723, + "step": 5360 + }, + { + "epoch": 1.57, + "learning_rate": 2.7427466698698864e-07, + "logits/chosen": -2.7274367809295654, + "logits/rejected": -2.7309534549713135, + "logps/chosen": -203.6020965576172, + "logps/rejected": -197.11085510253906, + "loss": 0.9108, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.20246362686157227, + "rewards/margins": 0.10691912472248077, + "rewards/rejected": -0.30938273668289185, + "step": 5370 + }, + { + "epoch": 1.57, + "learning_rate": 2.7342991830943437e-07, + "logits/chosen": -2.766284942626953, + "logits/rejected": -2.7335293292999268, + "logps/chosen": -203.7217559814453, + "logps/rejected": -175.9837188720703, + "loss": 0.9585, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.22377462685108185, + "rewards/margins": 0.05499381572008133, + "rewards/rejected": -0.278768390417099, + "step": 5380 + }, + { + "epoch": 1.57, + "learning_rate": 2.7258489966079206e-07, + "logits/chosen": -2.7298882007598877, + "logits/rejected": -2.738774061203003, + "logps/chosen": -191.04055786132812, + "logps/rejected": -186.08273315429688, + "loss": 0.9449, + "rewards/accuracies": 0.5718749761581421, + "rewards/chosen": -0.2002135068178177, + "rewards/margins": 0.08072539418935776, + "rewards/rejected": -0.28093892335891724, + "step": 5390 + }, + { + "epoch": 1.58, + "learning_rate": 2.717396207777841e-07, + "logits/chosen": -2.7327914237976074, + "logits/rejected": -2.7400810718536377, + "logps/chosen": -187.4174041748047, + "logps/rejected": -185.2515869140625, + "loss": 0.9337, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.204170823097229, + "rewards/margins": 0.06763770431280136, + "rewards/rejected": -0.27180856466293335, + "step": 5400 + }, + { + "epoch": 1.58, + "eval_logits/chosen": -2.6513025760650635, + "eval_logits/rejected": -2.6459460258483887, + "eval_logps/chosen": -197.645263671875, + "eval_logps/rejected": -183.72079467773438, + "eval_loss": 0.934425950050354, + "eval_rewards/accuracies": 0.5786879658699036, + "eval_rewards/chosen": -0.20399518311023712, + "eval_rewards/margins": 0.07595469802618027, + "eval_rewards/rejected": -0.2799498438835144, + "eval_runtime": 443.3253, + "eval_samples_per_second": 26.538, + "eval_steps_per_second": 3.318, + "step": 5400 + }, + { + "epoch": 1.58, + "learning_rate": 2.7089409140013103e-07, + "logits/chosen": -2.7303237915039062, + "logits/rejected": -2.7288081645965576, + "logps/chosen": -216.2913055419922, + "logps/rejected": -185.7354736328125, + "loss": 0.9246, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.1843048334121704, + "rewards/margins": 0.08620087802410126, + "rewards/rejected": -0.2705056965351105, + "step": 5410 + }, + { + "epoch": 1.58, + "learning_rate": 2.700483212704398e-07, + "logits/chosen": -2.740948438644409, + "logits/rejected": -2.7331557273864746, + "logps/chosen": -200.00587463378906, + "logps/rejected": -190.20492553710938, + "loss": 0.931, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.22048547863960266, + "rewards/margins": 0.07095751911401749, + "rewards/rejected": -0.29144302010536194, + "step": 5420 + }, + { + "epoch": 1.58, + "learning_rate": 2.692023201340915e-07, + "logits/chosen": -2.7501890659332275, + "logits/rejected": -2.738179922103882, + "logps/chosen": -197.78414916992188, + "logps/rejected": -191.88436889648438, + "loss": 0.9055, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.25792795419692993, + "rewards/margins": 0.08034543693065643, + "rewards/rejected": -0.3382733464241028, + "step": 5430 + }, + { + "epoch": 1.59, + "learning_rate": 2.6835609773912903e-07, + "logits/chosen": -2.728278160095215, + "logits/rejected": -2.7290234565734863, + "logps/chosen": -184.92025756835938, + "logps/rejected": -178.275390625, + "loss": 0.9192, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.19944298267364502, + "rewards/margins": 0.10552481561899185, + "rewards/rejected": -0.30496782064437866, + "step": 5440 + }, + { + "epoch": 1.59, + "learning_rate": 2.675096638361446e-07, + "logits/chosen": -2.720247983932495, + "logits/rejected": -2.7001519203186035, + "logps/chosen": -200.8359832763672, + "logps/rejected": -184.19882202148438, + "loss": 0.9261, + "rewards/accuracies": 0.6031249761581421, + "rewards/chosen": -0.22018194198608398, + "rewards/margins": 0.06855995953083038, + "rewards/rejected": -0.2887418866157532, + "step": 5450 + }, + { + "epoch": 1.59, + "learning_rate": 2.666630281781676e-07, + "logits/chosen": -2.715918779373169, + "logits/rejected": -2.7154223918914795, + "logps/chosen": -202.9798126220703, + "logps/rejected": -189.34768676757812, + "loss": 0.9272, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.22410516440868378, + "rewards/margins": 0.09275824576616287, + "rewards/rejected": -0.31686341762542725, + "step": 5460 + }, + { + "epoch": 1.6, + "learning_rate": 2.658162005205522e-07, + "logits/chosen": -2.7463111877441406, + "logits/rejected": -2.7408008575439453, + "logps/chosen": -189.31149291992188, + "logps/rejected": -174.1709442138672, + "loss": 0.9472, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.21325497329235077, + "rewards/margins": 0.0632646232843399, + "rewards/rejected": -0.2765195965766907, + "step": 5470 + }, + { + "epoch": 1.6, + "learning_rate": 2.6496919062086466e-07, + "logits/chosen": -2.7319397926330566, + "logits/rejected": -2.7326912879943848, + "logps/chosen": -213.6542205810547, + "logps/rejected": -202.09652709960938, + "loss": 0.9157, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.20796532928943634, + "rewards/margins": 0.08152662217617035, + "rewards/rejected": -0.2894919216632843, + "step": 5480 + }, + { + "epoch": 1.6, + "learning_rate": 2.641220082387714e-07, + "logits/chosen": -2.7371435165405273, + "logits/rejected": -2.7322709560394287, + "logps/chosen": -189.59552001953125, + "logps/rejected": -177.81393432617188, + "loss": 0.9354, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.21587204933166504, + "rewards/margins": 0.09057654440402985, + "rewards/rejected": -0.3064486086368561, + "step": 5490 + }, + { + "epoch": 1.6, + "learning_rate": 2.6327466313592605e-07, + "logits/chosen": -2.7272610664367676, + "logits/rejected": -2.755542278289795, + "logps/chosen": -177.8417510986328, + "logps/rejected": -187.9776153564453, + "loss": 0.919, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.20907747745513916, + "rewards/margins": 0.09078783541917801, + "rewards/rejected": -0.2998653054237366, + "step": 5500 + }, + { + "epoch": 1.6, + "eval_logits/chosen": -2.6445047855377197, + "eval_logits/rejected": -2.6390130519866943, + "eval_logps/chosen": -197.66366577148438, + "eval_logps/rejected": -183.74652099609375, + "eval_loss": 0.9334189295768738, + "eval_rewards/accuracies": 0.5810673236846924, + "eval_rewards/chosen": -0.2058352380990982, + "eval_rewards/margins": 0.07668833434581757, + "eval_rewards/rejected": -0.28252357244491577, + "eval_runtime": 443.355, + "eval_samples_per_second": 26.536, + "eval_steps_per_second": 3.318, + "step": 5500 + }, + { + "epoch": 1.61, + "learning_rate": 2.624271650758574e-07, + "logits/chosen": -2.6975483894348145, + "logits/rejected": -2.7088265419006348, + "logps/chosen": -176.8209991455078, + "logps/rejected": -166.51121520996094, + "loss": 0.9416, + "rewards/accuracies": 0.5531250238418579, + "rewards/chosen": -0.21087944507598877, + "rewards/margins": 0.05349407345056534, + "rewards/rejected": -0.2643735110759735, + "step": 5510 + }, + { + "epoch": 1.61, + "learning_rate": 2.615795238238565e-07, + "logits/chosen": -2.7559120655059814, + "logits/rejected": -2.741698741912842, + "logps/chosen": -211.35818481445312, + "logps/rejected": -183.8773956298828, + "loss": 0.9254, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.1826242059469223, + "rewards/margins": 0.09974977374076843, + "rewards/rejected": -0.28237396478652954, + "step": 5520 + }, + { + "epoch": 1.61, + "learning_rate": 2.607317491468644e-07, + "logits/chosen": -2.738673210144043, + "logits/rejected": -2.741772174835205, + "logps/chosen": -191.3245086669922, + "logps/rejected": -174.14743041992188, + "loss": 0.9398, + "rewards/accuracies": 0.559374988079071, + "rewards/chosen": -0.1919967234134674, + "rewards/margins": 0.0601605661213398, + "rewards/rejected": -0.2521572709083557, + "step": 5530 + }, + { + "epoch": 1.62, + "learning_rate": 2.598838508133596e-07, + "logits/chosen": -2.717775821685791, + "logits/rejected": -2.7444305419921875, + "logps/chosen": -176.27365112304688, + "logps/rejected": -180.37942504882812, + "loss": 0.9109, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.20145054161548615, + "rewards/margins": 0.07873423397541046, + "rewards/rejected": -0.2801847755908966, + "step": 5540 + }, + { + "epoch": 1.62, + "learning_rate": 2.590358385932452e-07, + "logits/chosen": -2.752577304840088, + "logits/rejected": -2.7411880493164062, + "logps/chosen": -188.3993682861328, + "logps/rejected": -179.72702026367188, + "loss": 0.9297, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.20420292019844055, + "rewards/margins": 0.07471559941768646, + "rewards/rejected": -0.27891847491264343, + "step": 5550 + }, + { + "epoch": 1.62, + "learning_rate": 2.5818772225773704e-07, + "logits/chosen": -2.740549087524414, + "logits/rejected": -2.7294859886169434, + "logps/chosen": -214.0988311767578, + "logps/rejected": -194.05979919433594, + "loss": 0.9154, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.1923362910747528, + "rewards/margins": 0.09265486896038055, + "rewards/rejected": -0.28499117493629456, + "step": 5560 + }, + { + "epoch": 1.62, + "learning_rate": 2.5733951157924997e-07, + "logits/chosen": -2.7381107807159424, + "logits/rejected": -2.731104612350464, + "logps/chosen": -178.1517791748047, + "logps/rejected": -169.85348510742188, + "loss": 0.935, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.2057117521762848, + "rewards/margins": 0.0582122728228569, + "rewards/rejected": -0.2639240622520447, + "step": 5570 + }, + { + "epoch": 1.63, + "learning_rate": 2.5649121633128656e-07, + "logits/chosen": -2.7527637481689453, + "logits/rejected": -2.7482333183288574, + "logps/chosen": -202.63143920898438, + "logps/rejected": -194.18113708496094, + "loss": 0.9452, + "rewards/accuracies": 0.559374988079071, + "rewards/chosen": -0.22132262587547302, + "rewards/margins": 0.06279505789279938, + "rewards/rejected": -0.2841176986694336, + "step": 5580 + }, + { + "epoch": 1.63, + "learning_rate": 2.556428462883232e-07, + "logits/chosen": -2.724813222885132, + "logits/rejected": -2.7437031269073486, + "logps/chosen": -188.14248657226562, + "logps/rejected": -192.07470703125, + "loss": 0.9244, + "rewards/accuracies": 0.6343749761581421, + "rewards/chosen": -0.22033333778381348, + "rewards/margins": 0.08756639808416367, + "rewards/rejected": -0.30789971351623535, + "step": 5590 + }, + { + "epoch": 1.63, + "learning_rate": 2.5479441122569874e-07, + "logits/chosen": -2.7545359134674072, + "logits/rejected": -2.745806932449341, + "logps/chosen": -206.4933319091797, + "logps/rejected": -184.64808654785156, + "loss": 0.9297, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.21928980946540833, + "rewards/margins": 0.07845072448253632, + "rewards/rejected": -0.29774051904678345, + "step": 5600 + }, + { + "epoch": 1.63, + "eval_logits/chosen": -2.6471898555755615, + "eval_logits/rejected": -2.64178466796875, + "eval_logps/chosen": -197.658203125, + "eval_logps/rejected": -183.7437286376953, + "eval_loss": 0.9340550303459167, + "eval_rewards/accuracies": 0.5793677568435669, + "eval_rewards/chosen": -0.20529019832611084, + "eval_rewards/margins": 0.07695318758487701, + "eval_rewards/rejected": -0.28224337100982666, + "eval_runtime": 443.3931, + "eval_samples_per_second": 26.534, + "eval_steps_per_second": 3.318, + "step": 5600 + }, + { + "epoch": 1.64, + "learning_rate": 2.539459209195007e-07, + "logits/chosen": -2.733870267868042, + "logits/rejected": -2.7074484825134277, + "logps/chosen": -214.43017578125, + "logps/rejected": -184.8483428955078, + "loss": 0.9295, + "rewards/accuracies": 0.559374988079071, + "rewards/chosen": -0.19600990414619446, + "rewards/margins": 0.07425862550735474, + "rewards/rejected": -0.2702685296535492, + "step": 5610 + }, + { + "epoch": 1.64, + "learning_rate": 2.530973851464535e-07, + "logits/chosen": -2.7479662895202637, + "logits/rejected": -2.757333755493164, + "logps/chosen": -193.40345764160156, + "logps/rejected": -175.7198028564453, + "loss": 0.9435, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.21974997222423553, + "rewards/margins": 0.0870782732963562, + "rewards/rejected": -0.30682826042175293, + "step": 5620 + }, + { + "epoch": 1.64, + "learning_rate": 2.5224881368380513e-07, + "logits/chosen": -2.7397866249084473, + "logits/rejected": -2.738558292388916, + "logps/chosen": -198.94163513183594, + "logps/rejected": -190.87486267089844, + "loss": 0.9213, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.18970100581645966, + "rewards/margins": 0.09227032959461212, + "rewards/rejected": -0.2819713056087494, + "step": 5630 + }, + { + "epoch": 1.65, + "learning_rate": 2.514002163092152e-07, + "logits/chosen": -2.722203016281128, + "logits/rejected": -2.7369751930236816, + "logps/chosen": -216.5584259033203, + "logps/rejected": -201.0596466064453, + "loss": 0.9286, + "rewards/accuracies": 0.5531250238418579, + "rewards/chosen": -0.2221214771270752, + "rewards/margins": 0.07766957581043243, + "rewards/rejected": -0.29979103803634644, + "step": 5640 + }, + { + "epoch": 1.65, + "learning_rate": 2.5055160280064145e-07, + "logits/chosen": -2.728950023651123, + "logits/rejected": -2.7567801475524902, + "logps/chosen": -175.72958374023438, + "logps/rejected": -188.25039672851562, + "loss": 0.9278, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.23196330666542053, + "rewards/margins": 0.06436417996883392, + "rewards/rejected": -0.2963274419307709, + "step": 5650 + }, + { + "epoch": 1.65, + "learning_rate": 2.497029829362279e-07, + "logits/chosen": -2.701756000518799, + "logits/rejected": -2.7560033798217773, + "logps/chosen": -176.56668090820312, + "logps/rejected": -190.84999084472656, + "loss": 0.9211, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.19433894753456116, + "rewards/margins": 0.09112901985645294, + "rewards/rejected": -0.2854679524898529, + "step": 5660 + }, + { + "epoch": 1.65, + "learning_rate": 2.488543664941916e-07, + "logits/chosen": -2.7353200912475586, + "logits/rejected": -2.7525343894958496, + "logps/chosen": -192.02662658691406, + "logps/rejected": -188.41783142089844, + "loss": 0.9235, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.20639260113239288, + "rewards/margins": 0.10132446140050888, + "rewards/rejected": -0.30771705508232117, + "step": 5670 + }, + { + "epoch": 1.66, + "learning_rate": 2.480057632527103e-07, + "logits/chosen": -2.732024908065796, + "logits/rejected": -2.7357800006866455, + "logps/chosen": -174.36900329589844, + "logps/rejected": -170.44863891601562, + "loss": 0.9486, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.22138449549674988, + "rewards/margins": 0.05923574045300484, + "rewards/rejected": -0.2806202471256256, + "step": 5680 + }, + { + "epoch": 1.66, + "learning_rate": 2.471571829898095e-07, + "logits/chosen": -2.7303781509399414, + "logits/rejected": -2.750192880630493, + "logps/chosen": -205.6610565185547, + "logps/rejected": -206.914794921875, + "loss": 0.919, + "rewards/accuracies": 0.6156250238418579, + "rewards/chosen": -0.21082866191864014, + "rewards/margins": 0.08251901715993881, + "rewards/rejected": -0.29334768652915955, + "step": 5690 + }, + { + "epoch": 1.66, + "learning_rate": 2.4630863548325e-07, + "logits/chosen": -2.735438585281372, + "logits/rejected": -2.7419936656951904, + "logps/chosen": -188.0749969482422, + "logps/rejected": -184.5003204345703, + "loss": 0.9174, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.196400448679924, + "rewards/margins": 0.1187920793890953, + "rewards/rejected": -0.3151925206184387, + "step": 5700 + }, + { + "epoch": 1.66, + "eval_logits/chosen": -2.654500961303711, + "eval_logits/rejected": -2.6492364406585693, + "eval_logps/chosen": -197.672607421875, + "eval_logps/rejected": -183.75535583496094, + "eval_loss": 0.9332903027534485, + "eval_rewards/accuracies": 0.580047607421875, + "eval_rewards/chosen": -0.20673073828220367, + "eval_rewards/margins": 0.07667768001556396, + "eval_rewards/rejected": -0.28340843319892883, + "eval_runtime": 443.3386, + "eval_samples_per_second": 26.537, + "eval_steps_per_second": 3.318, + "step": 5700 + }, + { + "epoch": 1.67, + "learning_rate": 2.4546013051041514e-07, + "logits/chosen": -2.7554385662078857, + "logits/rejected": -2.755969524383545, + "logps/chosen": -189.93936157226562, + "logps/rejected": -173.48239135742188, + "loss": 0.9304, + "rewards/accuracies": 0.596875011920929, + "rewards/chosen": -0.20026805996894836, + "rewards/margins": 0.09992248564958572, + "rewards/rejected": -0.3001905381679535, + "step": 5710 + }, + { + "epoch": 1.67, + "learning_rate": 2.4461167784819827e-07, + "logits/chosen": -2.728515625, + "logits/rejected": -2.7146217823028564, + "logps/chosen": -191.40211486816406, + "logps/rejected": -182.94676208496094, + "loss": 0.9361, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.21724525094032288, + "rewards/margins": 0.07030661404132843, + "rewards/rejected": -0.2875518202781677, + "step": 5720 + }, + { + "epoch": 1.67, + "learning_rate": 2.4376328727288974e-07, + "logits/chosen": -2.7447657585144043, + "logits/rejected": -2.751224994659424, + "logps/chosen": -178.4643096923828, + "logps/rejected": -178.66090393066406, + "loss": 0.9383, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.2255851775407791, + "rewards/margins": 0.05275057628750801, + "rewards/rejected": -0.2783357501029968, + "step": 5730 + }, + { + "epoch": 1.67, + "learning_rate": 2.429149685600648e-07, + "logits/chosen": -2.729738473892212, + "logits/rejected": -2.698761224746704, + "logps/chosen": -183.81948852539062, + "logps/rejected": -174.73214721679688, + "loss": 0.9379, + "rewards/accuracies": 0.596875011920929, + "rewards/chosen": -0.21086111664772034, + "rewards/margins": 0.05557774752378464, + "rewards/rejected": -0.26643887162208557, + "step": 5740 + }, + { + "epoch": 1.68, + "learning_rate": 2.4206673148447066e-07, + "logits/chosen": -2.726379871368408, + "logits/rejected": -2.706997871398926, + "logps/chosen": -181.8163604736328, + "logps/rejected": -158.59722900390625, + "loss": 0.9467, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.19309577345848083, + "rewards/margins": 0.061171580106019974, + "rewards/rejected": -0.2542673647403717, + "step": 5750 + }, + { + "epoch": 1.68, + "learning_rate": 2.4121858581991353e-07, + "logits/chosen": -2.734316110610962, + "logits/rejected": -2.7303547859191895, + "logps/chosen": -194.5187530517578, + "logps/rejected": -185.96414184570312, + "loss": 0.9143, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.20588722825050354, + "rewards/margins": 0.0946664959192276, + "rewards/rejected": -0.30055373907089233, + "step": 5760 + }, + { + "epoch": 1.68, + "learning_rate": 2.403705413391467e-07, + "logits/chosen": -2.7414212226867676, + "logits/rejected": -2.7207446098327637, + "logps/chosen": -213.69198608398438, + "logps/rejected": -181.99899291992188, + "loss": 0.928, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.2160993367433548, + "rewards/margins": 0.0850725769996643, + "rewards/rejected": -0.3011718988418579, + "step": 5770 + }, + { + "epoch": 1.69, + "learning_rate": 2.3952260781375726e-07, + "logits/chosen": -2.7480435371398926, + "logits/rejected": -2.7473959922790527, + "logps/chosen": -205.23489379882812, + "logps/rejected": -193.68539428710938, + "loss": 0.9355, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.211337611079216, + "rewards/margins": 0.06998740881681442, + "rewards/rejected": -0.28132501244544983, + "step": 5780 + }, + { + "epoch": 1.69, + "learning_rate": 2.386747950140541e-07, + "logits/chosen": -2.7476816177368164, + "logits/rejected": -2.7682197093963623, + "logps/chosen": -200.57284545898438, + "logps/rejected": -198.715087890625, + "loss": 0.9177, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.20661315321922302, + "rewards/margins": 0.1073889508843422, + "rewards/rejected": -0.3140020966529846, + "step": 5790 + }, + { + "epoch": 1.69, + "learning_rate": 2.3782711270895492e-07, + "logits/chosen": -2.7584152221679688, + "logits/rejected": -2.748316764831543, + "logps/chosen": -203.68675231933594, + "logps/rejected": -195.41549682617188, + "loss": 0.9275, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.2284468412399292, + "rewards/margins": 0.08097778260707855, + "rewards/rejected": -0.30942457914352417, + "step": 5800 + }, + { + "epoch": 1.69, + "eval_logits/chosen": -2.6524062156677246, + "eval_logits/rejected": -2.647106647491455, + "eval_logps/chosen": -197.66415405273438, + "eval_logps/rejected": -183.74755859375, + "eval_loss": 0.9332142472267151, + "eval_rewards/accuracies": 0.5759687423706055, + "eval_rewards/chosen": -0.20588359236717224, + "eval_rewards/margins": 0.07674256712198257, + "eval_rewards/rejected": -0.2826261818408966, + "eval_runtime": 443.3985, + "eval_samples_per_second": 26.534, + "eval_steps_per_second": 3.318, + "step": 5800 + }, + { + "epoch": 1.69, + "learning_rate": 2.3697957066587383e-07, + "logits/chosen": -2.7346596717834473, + "logits/rejected": -2.7185769081115723, + "logps/chosen": -200.0314178466797, + "logps/rejected": -178.08457946777344, + "loss": 0.9587, + "rewards/accuracies": 0.47187501192092896, + "rewards/chosen": -0.2539510726928711, + "rewards/margins": 0.03835904970765114, + "rewards/rejected": -0.29231011867523193, + "step": 5810 + }, + { + "epoch": 1.7, + "learning_rate": 2.3613217865060852e-07, + "logits/chosen": -2.7300784587860107, + "logits/rejected": -2.708784818649292, + "logps/chosen": -197.45974731445312, + "logps/rejected": -175.445556640625, + "loss": 0.9341, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.23136886954307556, + "rewards/margins": 0.04439845681190491, + "rewards/rejected": -0.27576732635498047, + "step": 5820 + }, + { + "epoch": 1.7, + "learning_rate": 2.352849464272285e-07, + "logits/chosen": -2.74354887008667, + "logits/rejected": -2.7220675945281982, + "logps/chosen": -223.2447509765625, + "logps/rejected": -192.660400390625, + "loss": 0.929, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.2157532423734665, + "rewards/margins": 0.08507034182548523, + "rewards/rejected": -0.3008235991001129, + "step": 5830 + }, + { + "epoch": 1.7, + "learning_rate": 2.3443788375796174e-07, + "logits/chosen": -2.7344906330108643, + "logits/rejected": -2.7282562255859375, + "logps/chosen": -220.3262176513672, + "logps/rejected": -205.54330444335938, + "loss": 0.9263, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.24231091141700745, + "rewards/margins": 0.062385208904743195, + "rewards/rejected": -0.30469608306884766, + "step": 5840 + }, + { + "epoch": 1.71, + "learning_rate": 2.3359100040308243e-07, + "logits/chosen": -2.7421112060546875, + "logits/rejected": -2.723379135131836, + "logps/chosen": -198.9661407470703, + "logps/rejected": -178.9770965576172, + "loss": 0.9394, + "rewards/accuracies": 0.5531250238418579, + "rewards/chosen": -0.2452888786792755, + "rewards/margins": 0.04774923250079155, + "rewards/rejected": -0.29303810000419617, + "step": 5850 + }, + { + "epoch": 1.71, + "learning_rate": 2.3274430612079892e-07, + "logits/chosen": -2.7680325508117676, + "logits/rejected": -2.7334728240966797, + "logps/chosen": -216.655517578125, + "logps/rejected": -190.80474853515625, + "loss": 0.9297, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.2257642298936844, + "rewards/margins": 0.07767541706562042, + "rewards/rejected": -0.3034396171569824, + "step": 5860 + }, + { + "epoch": 1.71, + "learning_rate": 2.318978106671407e-07, + "logits/chosen": -2.740093231201172, + "logits/rejected": -2.7320263385772705, + "logps/chosen": -188.4129180908203, + "logps/rejected": -181.80380249023438, + "loss": 0.9246, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.22893789410591125, + "rewards/margins": 0.065252386033535, + "rewards/rejected": -0.29419028759002686, + "step": 5870 + }, + { + "epoch": 1.72, + "learning_rate": 2.3105152379584642e-07, + "logits/chosen": -2.7359161376953125, + "logits/rejected": -2.7144532203674316, + "logps/chosen": -198.8520965576172, + "logps/rejected": -170.20571899414062, + "loss": 0.9385, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.1944771260023117, + "rewards/margins": 0.07553229480981827, + "rewards/rejected": -0.27000942826271057, + "step": 5880 + }, + { + "epoch": 1.72, + "learning_rate": 2.3020545525825119e-07, + "logits/chosen": -2.7251765727996826, + "logits/rejected": -2.7204253673553467, + "logps/chosen": -212.84732055664062, + "logps/rejected": -192.09767150878906, + "loss": 0.9052, + "rewards/accuracies": 0.628125011920929, + "rewards/chosen": -0.1890733242034912, + "rewards/margins": 0.10702107846736908, + "rewards/rejected": -0.2960943877696991, + "step": 5890 + }, + { + "epoch": 1.72, + "learning_rate": 2.2935961480317463e-07, + "logits/chosen": -2.71047306060791, + "logits/rejected": -2.7011935710906982, + "logps/chosen": -199.55006408691406, + "logps/rejected": -189.4619140625, + "loss": 0.9164, + "rewards/accuracies": 0.590624988079071, + "rewards/chosen": -0.21190086007118225, + "rewards/margins": 0.0709785670042038, + "rewards/rejected": -0.28287941217422485, + "step": 5900 + }, + { + "epoch": 1.72, + "eval_logits/chosen": -2.644160270690918, + "eval_logits/rejected": -2.638662576675415, + "eval_logps/chosen": -197.6846923828125, + "eval_logps/rejected": -183.78807067871094, + "eval_loss": 0.9320964217185974, + "eval_rewards/accuracies": 0.580897331237793, + "eval_rewards/chosen": -0.20793870091438293, + "eval_rewards/margins": 0.07873953133821487, + "eval_rewards/rejected": -0.2866782546043396, + "eval_runtime": 443.3868, + "eval_samples_per_second": 26.534, + "eval_steps_per_second": 3.318, + "step": 5900 + }, + { + "epoch": 1.72, + "learning_rate": 2.2851401217680788e-07, + "logits/chosen": -2.7377264499664307, + "logits/rejected": -2.739065408706665, + "logps/chosen": -185.49278259277344, + "logps/rejected": -182.3894500732422, + "loss": 0.9262, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.21655301749706268, + "rewards/margins": 0.05303264781832695, + "rewards/rejected": -0.26958566904067993, + "step": 5910 + }, + { + "epoch": 1.73, + "learning_rate": 2.2766865712260217e-07, + "logits/chosen": -2.7535338401794434, + "logits/rejected": -2.753856897354126, + "logps/chosen": -199.83055114746094, + "logps/rejected": -184.28892517089844, + "loss": 0.9271, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.21020498871803284, + "rewards/margins": 0.10383982956409454, + "rewards/rejected": -0.3140447735786438, + "step": 5920 + }, + { + "epoch": 1.73, + "learning_rate": 2.2682355938115583e-07, + "logits/chosen": -2.744236469268799, + "logits/rejected": -2.752063035964966, + "logps/chosen": -203.16470336914062, + "logps/rejected": -185.2848663330078, + "loss": 0.9166, + "rewards/accuracies": 0.6031249761581421, + "rewards/chosen": -0.20815984904766083, + "rewards/margins": 0.0902753472328186, + "rewards/rejected": -0.2984352111816406, + "step": 5930 + }, + { + "epoch": 1.73, + "learning_rate": 2.2597872869010218e-07, + "logits/chosen": -2.753807544708252, + "logits/rejected": -2.7454967498779297, + "logps/chosen": -215.274169921875, + "logps/rejected": -195.089599609375, + "loss": 0.903, + "rewards/accuracies": 0.596875011920929, + "rewards/chosen": -0.21169881522655487, + "rewards/margins": 0.11044590175151825, + "rewards/rejected": -0.3221447467803955, + "step": 5940 + }, + { + "epoch": 1.74, + "learning_rate": 2.2513417478399777e-07, + "logits/chosen": -2.735591411590576, + "logits/rejected": -2.7258079051971436, + "logps/chosen": -168.94375610351562, + "logps/rejected": -160.7762451171875, + "loss": 0.924, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.21011146903038025, + "rewards/margins": 0.07576417922973633, + "rewards/rejected": -0.2858756184577942, + "step": 5950 + }, + { + "epoch": 1.74, + "learning_rate": 2.2428990739420954e-07, + "logits/chosen": -2.7638401985168457, + "logits/rejected": -2.732102870941162, + "logps/chosen": -205.1365203857422, + "logps/rejected": -182.8333740234375, + "loss": 0.9109, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.20985054969787598, + "rewards/margins": 0.0838356763124466, + "rewards/rejected": -0.2936862111091614, + "step": 5960 + }, + { + "epoch": 1.74, + "learning_rate": 2.2344593624880342e-07, + "logits/chosen": -2.708482503890991, + "logits/rejected": -2.703840732574463, + "logps/chosen": -203.40245056152344, + "logps/rejected": -182.5862579345703, + "loss": 0.9505, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.21254822611808777, + "rewards/margins": 0.05914415791630745, + "rewards/rejected": -0.2716923952102661, + "step": 5970 + }, + { + "epoch": 1.74, + "learning_rate": 2.2260227107243154e-07, + "logits/chosen": -2.7273995876312256, + "logits/rejected": -2.7421963214874268, + "logps/chosen": -190.72694396972656, + "logps/rejected": -188.5596923828125, + "loss": 0.9285, + "rewards/accuracies": 0.546875, + "rewards/chosen": -0.2123124897480011, + "rewards/margins": 0.06590452790260315, + "rewards/rejected": -0.27821698784828186, + "step": 5980 + }, + { + "epoch": 1.75, + "learning_rate": 2.2175892158622075e-07, + "logits/chosen": -2.7460885047912598, + "logits/rejected": -2.7639546394348145, + "logps/chosen": -198.0811767578125, + "logps/rejected": -194.3120880126953, + "loss": 0.9193, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.20485849678516388, + "rewards/margins": 0.12061629444360733, + "rewards/rejected": -0.3254747986793518, + "step": 5990 + }, + { + "epoch": 1.75, + "learning_rate": 2.209158975076601e-07, + "logits/chosen": -2.71342134475708, + "logits/rejected": -2.717883348464966, + "logps/chosen": -199.0811309814453, + "logps/rejected": -189.3131561279297, + "loss": 0.9218, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.2159605324268341, + "rewards/margins": 0.06497268378734589, + "rewards/rejected": -0.2809332013130188, + "step": 6000 + }, + { + "epoch": 1.75, + "eval_logits/chosen": -2.643186092376709, + "eval_logits/rejected": -2.6376824378967285, + "eval_logps/chosen": -197.700439453125, + "eval_logps/rejected": -183.7935028076172, + "eval_loss": 0.9321945905685425, + "eval_rewards/accuracies": 0.5786879658699036, + "eval_rewards/chosen": -0.20951364934444427, + "eval_rewards/margins": 0.07770907878875732, + "eval_rewards/rejected": -0.287222683429718, + "eval_runtime": 443.3821, + "eval_samples_per_second": 26.535, + "eval_steps_per_second": 3.318, + "step": 6000 + }, + { + "epoch": 1.75, + "learning_rate": 2.2007320855048941e-07, + "logits/chosen": -2.7461142539978027, + "logits/rejected": -2.731106996536255, + "logps/chosen": -196.85745239257812, + "logps/rejected": -179.5032501220703, + "loss": 0.9265, + "rewards/accuracies": 0.5718749761581421, + "rewards/chosen": -0.2143532782793045, + "rewards/margins": 0.06113836169242859, + "rewards/rejected": -0.2754916250705719, + "step": 6010 + }, + { + "epoch": 1.76, + "learning_rate": 2.1923086442458701e-07, + "logits/chosen": -2.7388505935668945, + "logits/rejected": -2.7120919227600098, + "logps/chosen": -199.21661376953125, + "logps/rejected": -177.04495239257812, + "loss": 0.9123, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.18536700308322906, + "rewards/margins": 0.08062330633401871, + "rewards/rejected": -0.26599031686782837, + "step": 6020 + }, + { + "epoch": 1.76, + "learning_rate": 2.1838887483585773e-07, + "logits/chosen": -2.741877555847168, + "logits/rejected": -2.7501578330993652, + "logps/chosen": -189.5054931640625, + "logps/rejected": -181.48829650878906, + "loss": 0.9361, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.22205829620361328, + "rewards/margins": 0.054470986127853394, + "rewards/rejected": -0.2765292823314667, + "step": 6030 + }, + { + "epoch": 1.76, + "learning_rate": 2.175472494861214e-07, + "logits/chosen": -2.741569757461548, + "logits/rejected": -2.7047781944274902, + "logps/chosen": -217.58944702148438, + "logps/rejected": -188.43344116210938, + "loss": 0.9283, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.20472857356071472, + "rewards/margins": 0.07321996986865997, + "rewards/rejected": -0.2779485285282135, + "step": 6040 + }, + { + "epoch": 1.76, + "learning_rate": 2.1670599807300082e-07, + "logits/chosen": -2.7186341285705566, + "logits/rejected": -2.7301604747772217, + "logps/chosen": -196.19149780273438, + "logps/rejected": -194.16751098632812, + "loss": 0.9283, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.2274094521999359, + "rewards/margins": 0.05695997551083565, + "rewards/rejected": -0.28436940908432007, + "step": 6050 + }, + { + "epoch": 1.77, + "learning_rate": 2.158651302898103e-07, + "logits/chosen": -2.7257373332977295, + "logits/rejected": -2.724644184112549, + "logps/chosen": -194.86843872070312, + "logps/rejected": -180.59100341796875, + "loss": 0.9054, + "rewards/accuracies": 0.6156250238418579, + "rewards/chosen": -0.18583182990550995, + "rewards/margins": 0.10537779331207275, + "rewards/rejected": -0.2912096381187439, + "step": 6060 + }, + { + "epoch": 1.77, + "learning_rate": 2.1502465582544348e-07, + "logits/chosen": -2.722172975540161, + "logits/rejected": -2.74467134475708, + "logps/chosen": -190.9675750732422, + "logps/rejected": -196.57125854492188, + "loss": 0.9115, + "rewards/accuracies": 0.596875011920929, + "rewards/chosen": -0.2208135426044464, + "rewards/margins": 0.0937967300415039, + "rewards/rejected": -0.3146103024482727, + "step": 6070 + }, + { + "epoch": 1.77, + "learning_rate": 2.1418458436426227e-07, + "logits/chosen": -2.7449307441711426, + "logits/rejected": -2.7329206466674805, + "logps/chosen": -209.1360626220703, + "logps/rejected": -187.81834411621094, + "loss": 0.8947, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.20917844772338867, + "rewards/margins": 0.10158956050872803, + "rewards/rejected": -0.3107679784297943, + "step": 6080 + }, + { + "epoch": 1.78, + "learning_rate": 2.1334492558598467e-07, + "logits/chosen": -2.725964307785034, + "logits/rejected": -2.7144277095794678, + "logps/chosen": -198.27955627441406, + "logps/rejected": -182.8297119140625, + "loss": 0.9358, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.20703771710395813, + "rewards/margins": 0.0838475450873375, + "rewards/rejected": -0.2908852696418762, + "step": 6090 + }, + { + "epoch": 1.78, + "learning_rate": 2.1250568916557394e-07, + "logits/chosen": -2.711625337600708, + "logits/rejected": -2.675715208053589, + "logps/chosen": -194.16815185546875, + "logps/rejected": -163.89535522460938, + "loss": 0.944, + "rewards/accuracies": 0.5406249761581421, + "rewards/chosen": -0.23656761646270752, + "rewards/margins": 0.017660627141594887, + "rewards/rejected": -0.2542282044887543, + "step": 6100 + }, + { + "epoch": 1.78, + "eval_logits/chosen": -2.660655975341797, + "eval_logits/rejected": -2.6554903984069824, + "eval_logps/chosen": -197.71176147460938, + "eval_logps/rejected": -183.81625366210938, + "eval_loss": 0.9318803548812866, + "eval_rewards/accuracies": 0.5822569727897644, + "eval_rewards/chosen": -0.2106441855430603, + "eval_rewards/margins": 0.07885365933179855, + "eval_rewards/rejected": -0.28949788212776184, + "eval_runtime": 443.3707, + "eval_samples_per_second": 26.535, + "eval_steps_per_second": 3.318, + "step": 6100 + }, + { + "epoch": 1.78, + "learning_rate": 2.1166688477312648e-07, + "logits/chosen": -2.7371768951416016, + "logits/rejected": -2.739062547683716, + "logps/chosen": -187.1644744873047, + "logps/rejected": -182.38119506835938, + "loss": 0.9194, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.23588745296001434, + "rewards/margins": 0.09019680321216583, + "rewards/rejected": -0.3260842561721802, + "step": 6110 + }, + { + "epoch": 1.79, + "learning_rate": 2.1082852207376056e-07, + "logits/chosen": -2.7473931312561035, + "logits/rejected": -2.745567798614502, + "logps/chosen": -193.79232788085938, + "logps/rejected": -191.493408203125, + "loss": 0.9054, + "rewards/accuracies": 0.590624988079071, + "rewards/chosen": -0.20973005890846252, + "rewards/margins": 0.09154538810253143, + "rewards/rejected": -0.30127543210983276, + "step": 6120 + }, + { + "epoch": 1.79, + "learning_rate": 2.0999061072750527e-07, + "logits/chosen": -2.766697645187378, + "logits/rejected": -2.7617154121398926, + "logps/chosen": -204.7694854736328, + "logps/rejected": -187.0171661376953, + "loss": 0.9399, + "rewards/accuracies": 0.559374988079071, + "rewards/chosen": -0.22126245498657227, + "rewards/margins": 0.0732002705335617, + "rewards/rejected": -0.2944626808166504, + "step": 6130 + }, + { + "epoch": 1.79, + "learning_rate": 2.091531603891888e-07, + "logits/chosen": -2.7624351978302, + "logits/rejected": -2.7651846408843994, + "logps/chosen": -225.45278930664062, + "logps/rejected": -211.8724822998047, + "loss": 0.9139, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.2010863572359085, + "rewards/margins": 0.10098360478878021, + "rewards/rejected": -0.3020699620246887, + "step": 6140 + }, + { + "epoch": 1.79, + "learning_rate": 2.0831618070832756e-07, + "logits/chosen": -2.7233376502990723, + "logits/rejected": -2.733025074005127, + "logps/chosen": -178.06814575195312, + "logps/rejected": -194.65530395507812, + "loss": 0.9346, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.23188039660453796, + "rewards/margins": 0.05588630586862564, + "rewards/rejected": -0.2877667248249054, + "step": 6150 + }, + { + "epoch": 1.8, + "learning_rate": 2.0747968132901455e-07, + "logits/chosen": -2.7171859741210938, + "logits/rejected": -2.7284810543060303, + "logps/chosen": -191.13034057617188, + "logps/rejected": -176.52029418945312, + "loss": 0.929, + "rewards/accuracies": 0.6031249761581421, + "rewards/chosen": -0.2055968940258026, + "rewards/margins": 0.1095929741859436, + "rewards/rejected": -0.3151898980140686, + "step": 6160 + }, + { + "epoch": 1.8, + "learning_rate": 2.066436718898089e-07, + "logits/chosen": -2.73824405670166, + "logits/rejected": -2.745525360107422, + "logps/chosen": -183.75559997558594, + "logps/rejected": -182.84054565429688, + "loss": 0.8989, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.1956663727760315, + "rewards/margins": 0.09247630089521408, + "rewards/rejected": -0.28814268112182617, + "step": 6170 + }, + { + "epoch": 1.8, + "learning_rate": 2.0580816202362393e-07, + "logits/chosen": -2.7328238487243652, + "logits/rejected": -2.727752685546875, + "logps/chosen": -198.67794799804688, + "logps/rejected": -198.55172729492188, + "loss": 0.9179, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.21394948661327362, + "rewards/margins": 0.07331383973360062, + "rewards/rejected": -0.28726333379745483, + "step": 6180 + }, + { + "epoch": 1.81, + "learning_rate": 2.0497316135761699e-07, + "logits/chosen": -2.7127814292907715, + "logits/rejected": -2.739947557449341, + "logps/chosen": -198.18446350097656, + "logps/rejected": -204.63540649414062, + "loss": 0.9277, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.20710286498069763, + "rewards/margins": 0.11847794055938721, + "rewards/rejected": -0.3255808353424072, + "step": 6190 + }, + { + "epoch": 1.81, + "learning_rate": 2.041386795130781e-07, + "logits/chosen": -2.71413516998291, + "logits/rejected": -2.7128262519836426, + "logps/chosen": -217.21572875976562, + "logps/rejected": -203.00741577148438, + "loss": 0.9037, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.1752261370420456, + "rewards/margins": 0.11408879607915878, + "rewards/rejected": -0.28931495547294617, + "step": 6200 + }, + { + "epoch": 1.81, + "eval_logits/chosen": -2.6512789726257324, + "eval_logits/rejected": -2.645932912826538, + "eval_logps/chosen": -197.71023559570312, + "eval_logps/rejected": -183.81349182128906, + "eval_loss": 0.9323169589042664, + "eval_rewards/accuracies": 0.5780081748962402, + "eval_rewards/chosen": -0.2104932814836502, + "eval_rewards/margins": 0.07872689515352249, + "eval_rewards/rejected": -0.2892201840877533, + "eval_runtime": 443.4189, + "eval_samples_per_second": 26.532, + "eval_steps_per_second": 3.317, + "step": 6200 + }, + { + "epoch": 1.81, + "learning_rate": 2.0330472610531904e-07, + "logits/chosen": -2.7172884941101074, + "logits/rejected": -2.714123249053955, + "logps/chosen": -202.11622619628906, + "logps/rejected": -190.3927001953125, + "loss": 0.9249, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.2016797512769699, + "rewards/margins": 0.10873542726039886, + "rewards/rejected": -0.3104151785373688, + "step": 6210 + }, + { + "epoch": 1.81, + "learning_rate": 2.0247131074356282e-07, + "logits/chosen": -2.719733476638794, + "logits/rejected": -2.744614362716675, + "logps/chosen": -203.67276000976562, + "logps/rejected": -190.30633544921875, + "loss": 0.9149, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.24380454421043396, + "rewards/margins": 0.08631626516580582, + "rewards/rejected": -0.3301208019256592, + "step": 6220 + }, + { + "epoch": 1.82, + "learning_rate": 2.016384430308327e-07, + "logits/chosen": -2.758436918258667, + "logits/rejected": -2.7610437870025635, + "logps/chosen": -192.81202697753906, + "logps/rejected": -178.98318481445312, + "loss": 0.9235, + "rewards/accuracies": 0.6031249761581421, + "rewards/chosen": -0.2031092643737793, + "rewards/margins": 0.08105708658695221, + "rewards/rejected": -0.2841663360595703, + "step": 6230 + }, + { + "epoch": 1.82, + "learning_rate": 2.0080613256384176e-07, + "logits/chosen": -2.709527015686035, + "logits/rejected": -2.7233712673187256, + "logps/chosen": -200.44332885742188, + "logps/rejected": -185.1295928955078, + "loss": 0.939, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.21839790046215057, + "rewards/margins": 0.06442956626415253, + "rewards/rejected": -0.2828274965286255, + "step": 6240 + }, + { + "epoch": 1.82, + "learning_rate": 1.9997438893288206e-07, + "logits/chosen": -2.7478461265563965, + "logits/rejected": -2.7245230674743652, + "logps/chosen": -203.77633666992188, + "logps/rejected": -185.59506225585938, + "loss": 0.9275, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.2087053805589676, + "rewards/margins": 0.04750724509358406, + "rewards/rejected": -0.25621265172958374, + "step": 6250 + }, + { + "epoch": 1.83, + "learning_rate": 1.991432217217147e-07, + "logits/chosen": -2.74981951713562, + "logits/rejected": -2.730909585952759, + "logps/chosen": -208.09963989257812, + "logps/rejected": -191.15164184570312, + "loss": 0.9082, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.21967999637126923, + "rewards/margins": 0.09564986824989319, + "rewards/rejected": -0.3153298795223236, + "step": 6260 + }, + { + "epoch": 1.83, + "learning_rate": 1.9831264050745831e-07, + "logits/chosen": -2.753127336502075, + "logits/rejected": -2.7233431339263916, + "logps/chosen": -206.69058227539062, + "logps/rejected": -189.58457946777344, + "loss": 0.946, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.24597451090812683, + "rewards/margins": 0.040549568831920624, + "rewards/rejected": -0.28652408719062805, + "step": 6270 + }, + { + "epoch": 1.83, + "learning_rate": 1.9748265486048003e-07, + "logits/chosen": -2.734738826751709, + "logits/rejected": -2.7540624141693115, + "logps/chosen": -190.40771484375, + "logps/rejected": -191.31874084472656, + "loss": 0.8927, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.17388156056404114, + "rewards/margins": 0.12894825637340546, + "rewards/rejected": -0.3028298020362854, + "step": 6280 + }, + { + "epoch": 1.83, + "learning_rate": 1.9665327434428424e-07, + "logits/chosen": -2.7537999153137207, + "logits/rejected": -2.7414052486419678, + "logps/chosen": -188.02294921875, + "logps/rejected": -189.0082550048828, + "loss": 0.928, + "rewards/accuracies": 0.6031249761581421, + "rewards/chosen": -0.22517235577106476, + "rewards/margins": 0.09064620733261108, + "rewards/rejected": -0.31581854820251465, + "step": 6290 + }, + { + "epoch": 1.84, + "learning_rate": 1.9582450851540278e-07, + "logits/chosen": -2.729881763458252, + "logits/rejected": -2.697326183319092, + "logps/chosen": -200.64572143554688, + "logps/rejected": -184.64097595214844, + "loss": 0.929, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.19603231549263, + "rewards/margins": 0.0747486799955368, + "rewards/rejected": -0.2707809805870056, + "step": 6300 + }, + { + "epoch": 1.84, + "eval_logits/chosen": -2.6499528884887695, + "eval_logits/rejected": -2.644568681716919, + "eval_logps/chosen": -197.71945190429688, + "eval_logps/rejected": -183.8264923095703, + "eval_loss": 0.9321011304855347, + "eval_rewards/accuracies": 0.5773283243179321, + "eval_rewards/chosen": -0.21141472458839417, + "eval_rewards/margins": 0.07910703122615814, + "eval_rewards/rejected": -0.2905217409133911, + "eval_runtime": 443.3682, + "eval_samples_per_second": 26.536, + "eval_steps_per_second": 3.318, + "step": 6300 + }, + { + "epoch": 1.84, + "learning_rate": 1.9499636692328477e-07, + "logits/chosen": -2.729348659515381, + "logits/rejected": -2.734265089035034, + "logps/chosen": -198.71798706054688, + "logps/rejected": -195.2779083251953, + "loss": 0.9228, + "rewards/accuracies": 0.5531250238418579, + "rewards/chosen": -0.24377970397472382, + "rewards/margins": 0.06538959592580795, + "rewards/rejected": -0.30916929244995117, + "step": 6310 + }, + { + "epoch": 1.84, + "learning_rate": 1.9416885911018648e-07, + "logits/chosen": -2.7466068267822266, + "logits/rejected": -2.7390589714050293, + "logps/chosen": -212.66635131835938, + "logps/rejected": -198.4500274658203, + "loss": 0.9558, + "rewards/accuracies": 0.528124988079071, + "rewards/chosen": -0.2629418969154358, + "rewards/margins": 0.041651926934719086, + "rewards/rejected": -0.30459386110305786, + "step": 6320 + }, + { + "epoch": 1.85, + "learning_rate": 1.9334199461106165e-07, + "logits/chosen": -2.746856212615967, + "logits/rejected": -2.741549253463745, + "logps/chosen": -208.10165405273438, + "logps/rejected": -191.68942260742188, + "loss": 0.9406, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.20201078057289124, + "rewards/margins": 0.09430360794067383, + "rewards/rejected": -0.29631438851356506, + "step": 6330 + }, + { + "epoch": 1.85, + "learning_rate": 1.9251578295345113e-07, + "logits/chosen": -2.724238634109497, + "logits/rejected": -2.7157511711120605, + "logps/chosen": -202.83360290527344, + "logps/rejected": -182.34056091308594, + "loss": 0.9438, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.20443248748779297, + "rewards/margins": 0.05801036208868027, + "rewards/rejected": -0.26244282722473145, + "step": 6340 + }, + { + "epoch": 1.85, + "learning_rate": 1.9169023365737392e-07, + "logits/chosen": -2.766662120819092, + "logits/rejected": -2.779308319091797, + "logps/chosen": -196.58349609375, + "logps/rejected": -195.30563354492188, + "loss": 0.9444, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.23755855858325958, + "rewards/margins": 0.06650666147470474, + "rewards/rejected": -0.3040652573108673, + "step": 6350 + }, + { + "epoch": 1.86, + "learning_rate": 1.9086535623521626e-07, + "logits/chosen": -2.7374839782714844, + "logits/rejected": -2.723292112350464, + "logps/chosen": -210.5731964111328, + "logps/rejected": -171.655029296875, + "loss": 0.9246, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.19213752448558807, + "rewards/margins": 0.1119011640548706, + "rewards/rejected": -0.3040386736392975, + "step": 6360 + }, + { + "epoch": 1.86, + "learning_rate": 1.900411601916234e-07, + "logits/chosen": -2.716977834701538, + "logits/rejected": -2.726975917816162, + "logps/chosen": -188.70948791503906, + "logps/rejected": -186.00656127929688, + "loss": 0.9243, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.22361302375793457, + "rewards/margins": 0.07853808254003525, + "rewards/rejected": -0.3021511137485504, + "step": 6370 + }, + { + "epoch": 1.86, + "learning_rate": 1.8921765502338905e-07, + "logits/chosen": -2.758565902709961, + "logits/rejected": -2.7778704166412354, + "logps/chosen": -205.9553680419922, + "logps/rejected": -211.300537109375, + "loss": 0.9231, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.2435857355594635, + "rewards/margins": 0.08927027136087418, + "rewards/rejected": -0.33285602927207947, + "step": 6380 + }, + { + "epoch": 1.86, + "learning_rate": 1.8839485021934633e-07, + "logits/chosen": -2.7537710666656494, + "logits/rejected": -2.7532780170440674, + "logps/chosen": -191.62831115722656, + "logps/rejected": -187.5370330810547, + "loss": 0.938, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.2515445351600647, + "rewards/margins": 0.049747172743082047, + "rewards/rejected": -0.30129170417785645, + "step": 6390 + }, + { + "epoch": 1.87, + "learning_rate": 1.8757275526025857e-07, + "logits/chosen": -2.729072332382202, + "logits/rejected": -2.7408745288848877, + "logps/chosen": -200.15261840820312, + "logps/rejected": -198.6827392578125, + "loss": 0.9091, + "rewards/accuracies": 0.6156250238418579, + "rewards/chosen": -0.19985443353652954, + "rewards/margins": 0.10629860311746597, + "rewards/rejected": -0.3061530590057373, + "step": 6400 + }, + { + "epoch": 1.87, + "eval_logits/chosen": -2.658572196960449, + "eval_logits/rejected": -2.653358221054077, + "eval_logps/chosen": -197.71670532226562, + "eval_logps/rejected": -183.82521057128906, + "eval_loss": 0.9324252605438232, + "eval_rewards/accuracies": 0.5759687423706055, + "eval_rewards/chosen": -0.21114031970500946, + "eval_rewards/margins": 0.07925137132406235, + "eval_rewards/rejected": -0.2903916537761688, + "eval_runtime": 443.281, + "eval_samples_per_second": 26.541, + "eval_steps_per_second": 3.318, + "step": 6400 + }, + { + "epoch": 1.87, + "learning_rate": 1.8675137961870969e-07, + "logits/chosen": -2.696608781814575, + "logits/rejected": -2.721331834793091, + "logps/chosen": -169.80068969726562, + "logps/rejected": -181.3059539794922, + "loss": 0.9307, + "rewards/accuracies": 0.5531250238418579, + "rewards/chosen": -0.2106015682220459, + "rewards/margins": 0.05789356306195259, + "rewards/rejected": -0.2684951424598694, + "step": 6410 + }, + { + "epoch": 1.87, + "learning_rate": 1.8593073275899555e-07, + "logits/chosen": -2.7464489936828613, + "logits/rejected": -2.7503392696380615, + "logps/chosen": -198.9092559814453, + "logps/rejected": -187.02224731445312, + "loss": 0.9264, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.21138563752174377, + "rewards/margins": 0.0724065825343132, + "rewards/rejected": -0.2837921977043152, + "step": 6420 + }, + { + "epoch": 1.88, + "learning_rate": 1.851108241370143e-07, + "logits/chosen": -2.738217353820801, + "logits/rejected": -2.7316102981567383, + "logps/chosen": -193.5830841064453, + "logps/rejected": -185.67431640625, + "loss": 0.9206, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.19683505594730377, + "rewards/margins": 0.08236946910619736, + "rewards/rejected": -0.27920451760292053, + "step": 6430 + }, + { + "epoch": 1.88, + "learning_rate": 1.8429166320015816e-07, + "logits/chosen": -2.7444231510162354, + "logits/rejected": -2.710388660430908, + "logps/chosen": -199.27511596679688, + "logps/rejected": -178.7822265625, + "loss": 0.9369, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.222540020942688, + "rewards/margins": 0.03969267010688782, + "rewards/rejected": -0.2622326910495758, + "step": 6440 + }, + { + "epoch": 1.88, + "learning_rate": 1.834732593872037e-07, + "logits/chosen": -2.722276210784912, + "logits/rejected": -2.7346787452697754, + "logps/chosen": -194.71664428710938, + "logps/rejected": -189.44082641601562, + "loss": 0.9313, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.21700486540794373, + "rewards/margins": 0.0775427371263504, + "rewards/rejected": -0.2945476174354553, + "step": 6450 + }, + { + "epoch": 1.88, + "learning_rate": 1.826556221282039e-07, + "logits/chosen": -2.7683663368225098, + "logits/rejected": -2.77569317817688, + "logps/chosen": -194.61509704589844, + "logps/rejected": -184.4914093017578, + "loss": 0.9441, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.25148120522499084, + "rewards/margins": 0.08030495792627335, + "rewards/rejected": -0.3317861557006836, + "step": 6460 + }, + { + "epoch": 1.89, + "learning_rate": 1.81838760844379e-07, + "logits/chosen": -2.733006477355957, + "logits/rejected": -2.7150261402130127, + "logps/chosen": -211.0814666748047, + "logps/rejected": -180.62013244628906, + "loss": 0.9164, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.20144124329090118, + "rewards/margins": 0.08752738684415817, + "rewards/rejected": -0.28896862268447876, + "step": 6470 + }, + { + "epoch": 1.89, + "learning_rate": 1.81022684948008e-07, + "logits/chosen": -2.7351765632629395, + "logits/rejected": -2.7202677726745605, + "logps/chosen": -205.88320922851562, + "logps/rejected": -176.65267944335938, + "loss": 0.9175, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.19780614972114563, + "rewards/margins": 0.078305684030056, + "rewards/rejected": -0.27611178159713745, + "step": 6480 + }, + { + "epoch": 1.89, + "learning_rate": 1.8020740384232037e-07, + "logits/chosen": -2.729175329208374, + "logits/rejected": -2.7268869876861572, + "logps/chosen": -193.4375457763672, + "logps/rejected": -173.4163360595703, + "loss": 0.9309, + "rewards/accuracies": 0.596875011920929, + "rewards/chosen": -0.21653446555137634, + "rewards/margins": 0.07103071361780167, + "rewards/rejected": -0.2875651717185974, + "step": 6490 + }, + { + "epoch": 1.9, + "learning_rate": 1.7939292692138753e-07, + "logits/chosen": -2.7279040813446045, + "logits/rejected": -2.752498149871826, + "logps/chosen": -202.0452423095703, + "logps/rejected": -195.004150390625, + "loss": 0.9094, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.2281557023525238, + "rewards/margins": 0.10120411962270737, + "rewards/rejected": -0.32935982942581177, + "step": 6500 + }, + { + "epoch": 1.9, + "eval_logits/chosen": -2.6529784202575684, + "eval_logits/rejected": -2.6476540565490723, + "eval_logps/chosen": -197.72872924804688, + "eval_logps/rejected": -183.82424926757812, + "eval_loss": 0.9320737719535828, + "eval_rewards/accuracies": 0.5769884586334229, + "eval_rewards/chosen": -0.21234209835529327, + "eval_rewards/margins": 0.0779537484049797, + "eval_rewards/rejected": -0.2902958393096924, + "eval_runtime": 443.3609, + "eval_samples_per_second": 26.536, + "eval_steps_per_second": 3.318, + "step": 6500 + }, + { + "epoch": 1.9, + "learning_rate": 1.785792635700148e-07, + "logits/chosen": -2.7295122146606445, + "logits/rejected": -2.732819080352783, + "logps/chosen": -205.765380859375, + "logps/rejected": -192.5811004638672, + "loss": 0.9115, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.24779972434043884, + "rewards/margins": 0.0829542875289917, + "rewards/rejected": -0.33075404167175293, + "step": 6510 + }, + { + "epoch": 1.9, + "learning_rate": 1.777664231636329e-07, + "logits/chosen": -2.7400591373443604, + "logits/rejected": -2.7352259159088135, + "logps/chosen": -198.49118041992188, + "logps/rejected": -179.13111877441406, + "loss": 0.9279, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.2171938121318817, + "rewards/margins": 0.09275706857442856, + "rewards/rejected": -0.30995088815689087, + "step": 6520 + }, + { + "epoch": 1.9, + "learning_rate": 1.7695441506819058e-07, + "logits/chosen": -2.7267587184906006, + "logits/rejected": -2.732179641723633, + "logps/chosen": -195.60536193847656, + "logps/rejected": -188.46739196777344, + "loss": 0.9345, + "rewards/accuracies": 0.5718749761581421, + "rewards/chosen": -0.22522278130054474, + "rewards/margins": 0.073407843708992, + "rewards/rejected": -0.29863062500953674, + "step": 6530 + }, + { + "epoch": 1.91, + "learning_rate": 1.7614324864004604e-07, + "logits/chosen": -2.740993022918701, + "logits/rejected": -2.7591586112976074, + "logps/chosen": -207.13516235351562, + "logps/rejected": -204.57444763183594, + "loss": 0.915, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.2111998349428177, + "rewards/margins": 0.0766361653804779, + "rewards/rejected": -0.2878360152244568, + "step": 6540 + }, + { + "epoch": 1.91, + "learning_rate": 1.753329332258593e-07, + "logits/chosen": -2.782977342605591, + "logits/rejected": -2.7586865425109863, + "logps/chosen": -210.08950805664062, + "logps/rejected": -190.40237426757812, + "loss": 0.9359, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.23711267113685608, + "rewards/margins": 0.053105391561985016, + "rewards/rejected": -0.2902180552482605, + "step": 6550 + }, + { + "epoch": 1.91, + "learning_rate": 1.7452347816248476e-07, + "logits/chosen": -2.734740972518921, + "logits/rejected": -2.7349419593811035, + "logps/chosen": -199.7351837158203, + "logps/rejected": -184.5938720703125, + "loss": 0.9084, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.22013111412525177, + "rewards/margins": 0.10203494131565094, + "rewards/rejected": -0.3221660256385803, + "step": 6560 + }, + { + "epoch": 1.92, + "learning_rate": 1.7371489277686325e-07, + "logits/chosen": -2.711773157119751, + "logits/rejected": -2.713780641555786, + "logps/chosen": -191.1211700439453, + "logps/rejected": -174.2381591796875, + "loss": 0.9119, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.16844011843204498, + "rewards/margins": 0.11107321083545685, + "rewards/rejected": -0.27951332926750183, + "step": 6570 + }, + { + "epoch": 1.92, + "learning_rate": 1.7290718638591492e-07, + "logits/chosen": -2.739809036254883, + "logits/rejected": -2.754399538040161, + "logps/chosen": -176.8489532470703, + "logps/rejected": -178.35150146484375, + "loss": 0.9403, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.23008951544761658, + "rewards/margins": 0.05764657258987427, + "rewards/rejected": -0.28773611783981323, + "step": 6580 + }, + { + "epoch": 1.92, + "learning_rate": 1.721003682964316e-07, + "logits/chosen": -2.7267158031463623, + "logits/rejected": -2.701935291290283, + "logps/chosen": -212.7582550048828, + "logps/rejected": -187.20980834960938, + "loss": 0.9176, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.2046319991350174, + "rewards/margins": 0.0942133218050003, + "rewards/rejected": -0.2988453209400177, + "step": 6590 + }, + { + "epoch": 1.93, + "learning_rate": 1.7129444780496972e-07, + "logits/chosen": -2.74074649810791, + "logits/rejected": -2.746044874191284, + "logps/chosen": -202.53713989257812, + "logps/rejected": -191.43814086914062, + "loss": 0.9449, + "rewards/accuracies": 0.546875, + "rewards/chosen": -0.22108259797096252, + "rewards/margins": 0.07247807830572128, + "rewards/rejected": -0.2935606837272644, + "step": 6600 + }, + { + "epoch": 1.93, + "eval_logits/chosen": -2.6358284950256348, + "eval_logits/rejected": -2.6301984786987305, + "eval_logps/chosen": -197.7251434326172, + "eval_logps/rejected": -183.8246307373047, + "eval_loss": 0.932049036026001, + "eval_rewards/accuracies": 0.5795377492904663, + "eval_rewards/chosen": -0.2119826078414917, + "eval_rewards/margins": 0.07835091650485992, + "eval_rewards/rejected": -0.29033350944519043, + "eval_runtime": 443.2884, + "eval_samples_per_second": 26.54, + "eval_steps_per_second": 3.318, + "step": 6600 + }, + { + "epoch": 1.93, + "learning_rate": 1.7048943419774307e-07, + "logits/chosen": -2.747640609741211, + "logits/rejected": -2.730374813079834, + "logps/chosen": -203.86410522460938, + "logps/rejected": -194.0001983642578, + "loss": 0.9269, + "rewards/accuracies": 0.596875011920929, + "rewards/chosen": -0.22307124733924866, + "rewards/margins": 0.0955260619521141, + "rewards/rejected": -0.31859731674194336, + "step": 6610 + }, + { + "epoch": 1.93, + "learning_rate": 1.69685336750516e-07, + "logits/chosen": -2.760178327560425, + "logits/rejected": -2.7309234142303467, + "logps/chosen": -221.4056396484375, + "logps/rejected": -194.12762451171875, + "loss": 0.9376, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.2214629203081131, + "rewards/margins": 0.06823597103357315, + "rewards/rejected": -0.28969889879226685, + "step": 6620 + }, + { + "epoch": 1.93, + "learning_rate": 1.6888216472849638e-07, + "logits/chosen": -2.710864305496216, + "logits/rejected": -2.733487129211426, + "logps/chosen": -161.94752502441406, + "logps/rejected": -181.10289001464844, + "loss": 0.9176, + "rewards/accuracies": 0.5531250238418579, + "rewards/chosen": -0.22365054488182068, + "rewards/margins": 0.06910266727209091, + "rewards/rejected": -0.2927531898021698, + "step": 6630 + }, + { + "epoch": 1.94, + "learning_rate": 1.6807992738622871e-07, + "logits/chosen": -2.7660770416259766, + "logits/rejected": -2.7430419921875, + "logps/chosen": -223.51760864257812, + "logps/rejected": -193.828857421875, + "loss": 0.9266, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.21513095498085022, + "rewards/margins": 0.09501657634973526, + "rewards/rejected": -0.3101475238800049, + "step": 6640 + }, + { + "epoch": 1.94, + "learning_rate": 1.6727863396748793e-07, + "logits/chosen": -2.7407381534576416, + "logits/rejected": -2.746407985687256, + "logps/chosen": -215.10440063476562, + "logps/rejected": -196.6473846435547, + "loss": 0.9003, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.19051149487495422, + "rewards/margins": 0.14527949690818787, + "rewards/rejected": -0.3357909619808197, + "step": 6650 + }, + { + "epoch": 1.94, + "learning_rate": 1.6647829370517231e-07, + "logits/chosen": -2.7251319885253906, + "logits/rejected": -2.717182159423828, + "logps/chosen": -186.213134765625, + "logps/rejected": -174.87966918945312, + "loss": 0.9504, + "rewards/accuracies": 0.515625, + "rewards/chosen": -0.24467504024505615, + "rewards/margins": 0.03715928643941879, + "rewards/rejected": -0.28183430433273315, + "step": 6660 + }, + { + "epoch": 1.95, + "learning_rate": 1.6567891582119764e-07, + "logits/chosen": -2.729135513305664, + "logits/rejected": -2.7299442291259766, + "logps/chosen": -185.3548583984375, + "logps/rejected": -179.4267578125, + "loss": 0.9416, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.2128109484910965, + "rewards/margins": 0.05870833247900009, + "rewards/rejected": -0.2715193033218384, + "step": 6670 + }, + { + "epoch": 1.95, + "learning_rate": 1.6488050952639056e-07, + "logits/chosen": -2.7212636470794678, + "logits/rejected": -2.7113900184631348, + "logps/chosen": -210.2383270263672, + "logps/rejected": -193.41522216796875, + "loss": 0.9143, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.226904034614563, + "rewards/margins": 0.10393612086772919, + "rewards/rejected": -0.33084017038345337, + "step": 6680 + }, + { + "epoch": 1.95, + "learning_rate": 1.6408308402038267e-07, + "logits/chosen": -2.7432644367218018, + "logits/rejected": -2.74440598487854, + "logps/chosen": -202.8652801513672, + "logps/rejected": -188.2684326171875, + "loss": 0.9243, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.22147603332996368, + "rewards/margins": 0.05875271558761597, + "rewards/rejected": -0.28022870421409607, + "step": 6690 + }, + { + "epoch": 1.95, + "learning_rate": 1.632866484915043e-07, + "logits/chosen": -2.768315553665161, + "logits/rejected": -2.7549726963043213, + "logps/chosen": -210.89108276367188, + "logps/rejected": -196.1284637451172, + "loss": 0.9404, + "rewards/accuracies": 0.559374988079071, + "rewards/chosen": -0.22763648629188538, + "rewards/margins": 0.08506520837545395, + "rewards/rejected": -0.3127017021179199, + "step": 6700 + }, + { + "epoch": 1.95, + "eval_logits/chosen": -2.6496846675872803, + "eval_logits/rejected": -2.644331455230713, + "eval_logps/chosen": -197.72042846679688, + "eval_logps/rejected": -183.8302001953125, + "eval_loss": 0.9318643808364868, + "eval_rewards/accuracies": 0.5802175402641296, + "eval_rewards/chosen": -0.21151039004325867, + "eval_rewards/margins": 0.07937860488891602, + "eval_rewards/rejected": -0.29088902473449707, + "eval_runtime": 443.3291, + "eval_samples_per_second": 26.538, + "eval_steps_per_second": 3.318, + "step": 6700 + }, + { + "epoch": 1.96, + "learning_rate": 1.62491212116679e-07, + "logits/chosen": -2.7260818481445312, + "logits/rejected": -2.7126994132995605, + "logps/chosen": -191.04051208496094, + "logps/rejected": -170.42739868164062, + "loss": 0.9243, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.21634069085121155, + "rewards/margins": 0.0933542475104332, + "rewards/rejected": -0.30969494581222534, + "step": 6710 + }, + { + "epoch": 1.96, + "learning_rate": 1.616967840613175e-07, + "logits/chosen": -2.744236946105957, + "logits/rejected": -2.7506120204925537, + "logps/chosen": -191.96136474609375, + "logps/rejected": -186.02713012695312, + "loss": 0.9111, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.19615641236305237, + "rewards/margins": 0.10042214393615723, + "rewards/rejected": -0.2965785562992096, + "step": 6720 + }, + { + "epoch": 1.96, + "learning_rate": 1.609033734792119e-07, + "logits/chosen": -2.7435319423675537, + "logits/rejected": -2.7113170623779297, + "logps/chosen": -198.1565399169922, + "logps/rejected": -166.88198852539062, + "loss": 0.9242, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.2112916260957718, + "rewards/margins": 0.07510940730571747, + "rewards/rejected": -0.28640103340148926, + "step": 6730 + }, + { + "epoch": 1.97, + "learning_rate": 1.6011098951243093e-07, + "logits/chosen": -2.7205939292907715, + "logits/rejected": -2.7266383171081543, + "logps/chosen": -180.89730834960938, + "logps/rejected": -172.36257934570312, + "loss": 0.923, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.21849095821380615, + "rewards/margins": 0.07519636303186417, + "rewards/rejected": -0.2936873435974121, + "step": 6740 + }, + { + "epoch": 1.97, + "learning_rate": 1.5931964129121376e-07, + "logits/chosen": -2.7276265621185303, + "logits/rejected": -2.7258830070495605, + "logps/chosen": -189.77908325195312, + "logps/rejected": -181.21559143066406, + "loss": 0.9416, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.22878125309944153, + "rewards/margins": 0.06122662499547005, + "rewards/rejected": -0.29000788927078247, + "step": 6750 + }, + { + "epoch": 1.97, + "learning_rate": 1.5852933793386543e-07, + "logits/chosen": -2.7102389335632324, + "logits/rejected": -2.730618476867676, + "logps/chosen": -180.26690673828125, + "logps/rejected": -180.7349395751953, + "loss": 0.9272, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.22268176078796387, + "rewards/margins": 0.07356902211904526, + "rewards/rejected": -0.2962507903575897, + "step": 6760 + }, + { + "epoch": 1.97, + "learning_rate": 1.5774008854665136e-07, + "logits/chosen": -2.73473858833313, + "logits/rejected": -2.7441117763519287, + "logps/chosen": -187.04324340820312, + "logps/rejected": -191.31651306152344, + "loss": 0.9105, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.20239019393920898, + "rewards/margins": 0.10861504077911377, + "rewards/rejected": -0.31100520491600037, + "step": 6770 + }, + { + "epoch": 1.98, + "learning_rate": 1.569519022236928e-07, + "logits/chosen": -2.732327699661255, + "logits/rejected": -2.726734161376953, + "logps/chosen": -196.94210815429688, + "logps/rejected": -190.38333129882812, + "loss": 0.9269, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.21858474612236023, + "rewards/margins": 0.06861617416143417, + "rewards/rejected": -0.2872008979320526, + "step": 6780 + }, + { + "epoch": 1.98, + "learning_rate": 1.5616478804686162e-07, + "logits/chosen": -2.739928722381592, + "logits/rejected": -2.7340385913848877, + "logps/chosen": -197.3428955078125, + "logps/rejected": -181.82748413085938, + "loss": 0.9276, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.23716449737548828, + "rewards/margins": 0.06081245467066765, + "rewards/rejected": -0.29797694087028503, + "step": 6790 + }, + { + "epoch": 1.98, + "learning_rate": 1.5537875508567618e-07, + "logits/chosen": -2.7353885173797607, + "logits/rejected": -2.737687826156616, + "logps/chosen": -183.25909423828125, + "logps/rejected": -175.84182739257812, + "loss": 0.9155, + "rewards/accuracies": 0.590624988079071, + "rewards/chosen": -0.1979648917913437, + "rewards/margins": 0.08871322125196457, + "rewards/rejected": -0.2866780757904053, + "step": 6800 + }, + { + "epoch": 1.98, + "eval_logits/chosen": -2.6362035274505615, + "eval_logits/rejected": -2.6305882930755615, + "eval_logps/chosen": -197.72909545898438, + "eval_logps/rejected": -183.84056091308594, + "eval_loss": 0.9314205050468445, + "eval_rewards/accuracies": 0.5825968980789185, + "eval_rewards/chosen": -0.21238040924072266, + "eval_rewards/margins": 0.07954783737659454, + "eval_rewards/rejected": -0.2919282615184784, + "eval_runtime": 443.3072, + "eval_samples_per_second": 26.539, + "eval_steps_per_second": 3.318, + "step": 6800 + }, + { + "epoch": 1.99, + "learning_rate": 1.5459381239719637e-07, + "logits/chosen": -2.7328858375549316, + "logits/rejected": -2.7300026416778564, + "logps/chosen": -192.85726928710938, + "logps/rejected": -182.21939086914062, + "loss": 0.9299, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.24222655594348907, + "rewards/margins": 0.06386517733335495, + "rewards/rejected": -0.3060917258262634, + "step": 6810 + }, + { + "epoch": 1.99, + "learning_rate": 1.538099690259193e-07, + "logits/chosen": -2.7482893466949463, + "logits/rejected": -2.774064540863037, + "logps/chosen": -198.48825073242188, + "logps/rejected": -204.14930725097656, + "loss": 0.9155, + "rewards/accuracies": 0.596875011920929, + "rewards/chosen": -0.21093496680259705, + "rewards/margins": 0.10215537250041962, + "rewards/rejected": -0.31309035420417786, + "step": 6820 + }, + { + "epoch": 1.99, + "learning_rate": 1.5302723400367547e-07, + "logits/chosen": -2.7298903465270996, + "logits/rejected": -2.720515251159668, + "logps/chosen": -204.21322631835938, + "logps/rejected": -180.675537109375, + "loss": 0.9181, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1895557940006256, + "rewards/margins": 0.08064098656177521, + "rewards/rejected": -0.270196795463562, + "step": 6830 + }, + { + "epoch": 2.0, + "learning_rate": 1.522456163495242e-07, + "logits/chosen": -2.721670627593994, + "logits/rejected": -2.714440107345581, + "logps/chosen": -203.53138732910156, + "logps/rejected": -188.99951171875, + "loss": 0.9137, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.21865499019622803, + "rewards/margins": 0.1239236369729042, + "rewards/rejected": -0.34257861971855164, + "step": 6840 + }, + { + "epoch": 2.0, + "learning_rate": 1.5146512506965013e-07, + "logits/chosen": -2.751051187515259, + "logits/rejected": -2.725895643234253, + "logps/chosen": -192.48666381835938, + "logps/rejected": -174.069580078125, + "loss": 0.9222, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.23686334490776062, + "rewards/margins": 0.06425967812538147, + "rewards/rejected": -0.3011230528354645, + "step": 6850 + }, + { + "epoch": 2.0, + "learning_rate": 1.5068576915725912e-07, + "logits/chosen": -2.752620220184326, + "logits/rejected": -2.7401299476623535, + "logps/chosen": -217.71902465820312, + "logps/rejected": -194.8961181640625, + "loss": 0.9327, + "rewards/accuracies": 0.5718749761581421, + "rewards/chosen": -0.20294050872325897, + "rewards/margins": 0.08372064679861069, + "rewards/rejected": -0.28666117787361145, + "step": 6860 + }, + { + "epoch": 2.0, + "learning_rate": 1.4990755759247485e-07, + "logits/chosen": -2.7282445430755615, + "logits/rejected": -2.7199878692626953, + "logps/chosen": -201.61978149414062, + "logps/rejected": -178.6517791748047, + "loss": 0.9403, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.21178099513053894, + "rewards/margins": 0.09103866666555405, + "rewards/rejected": -0.3028196692466736, + "step": 6870 + }, + { + "epoch": 2.01, + "learning_rate": 1.4913049934223516e-07, + "logits/chosen": -2.7153429985046387, + "logits/rejected": -2.7482893466949463, + "logps/chosen": -197.48809814453125, + "logps/rejected": -202.34713745117188, + "loss": 0.9031, + "rewards/accuracies": 0.590624988079071, + "rewards/chosen": -0.20829913020133972, + "rewards/margins": 0.09428951889276505, + "rewards/rejected": -0.30258864164352417, + "step": 6880 + }, + { + "epoch": 2.01, + "learning_rate": 1.4835460336018895e-07, + "logits/chosen": -2.7747058868408203, + "logits/rejected": -2.7874245643615723, + "logps/chosen": -212.27536010742188, + "logps/rejected": -207.9674072265625, + "loss": 0.8989, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.2062179148197174, + "rewards/margins": 0.13777431845664978, + "rewards/rejected": -0.3439922332763672, + "step": 6890 + }, + { + "epoch": 2.01, + "learning_rate": 1.4757987858659296e-07, + "logits/chosen": -2.7338640689849854, + "logits/rejected": -2.7241692543029785, + "logps/chosen": -202.30865478515625, + "logps/rejected": -183.61050415039062, + "loss": 0.9328, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.21838633716106415, + "rewards/margins": 0.0648864209651947, + "rewards/rejected": -0.28327274322509766, + "step": 6900 + }, + { + "epoch": 2.01, + "eval_logits/chosen": -2.6352241039276123, + "eval_logits/rejected": -2.6295835971832275, + "eval_logps/chosen": -197.7320556640625, + "eval_logps/rejected": -183.84559631347656, + "eval_loss": 0.9312693476676941, + "eval_rewards/accuracies": 0.5883752703666687, + "eval_rewards/chosen": -0.212674081325531, + "eval_rewards/margins": 0.07975810021162033, + "eval_rewards/rejected": -0.29243215918540955, + "eval_runtime": 443.3007, + "eval_samples_per_second": 26.54, + "eval_steps_per_second": 3.318, + "step": 6900 + }, + { + "epoch": 2.02, + "learning_rate": 1.468063339482084e-07, + "logits/chosen": -2.70162034034729, + "logits/rejected": -2.7118117809295654, + "logps/chosen": -176.92330932617188, + "logps/rejected": -171.45474243164062, + "loss": 0.9329, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.24294695258140564, + "rewards/margins": 0.07506359368562698, + "rewards/rejected": -0.3180105686187744, + "step": 6910 + }, + { + "epoch": 2.02, + "learning_rate": 1.4603397835819864e-07, + "logits/chosen": -2.748415946960449, + "logits/rejected": -2.758174419403076, + "logps/chosen": -215.23764038085938, + "logps/rejected": -211.4716339111328, + "loss": 0.9061, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.21699877083301544, + "rewards/margins": 0.11743853241205215, + "rewards/rejected": -0.3344372808933258, + "step": 6920 + }, + { + "epoch": 2.02, + "learning_rate": 1.4526282071602602e-07, + "logits/chosen": -2.735396146774292, + "logits/rejected": -2.740246295928955, + "logps/chosen": -190.8970947265625, + "logps/rejected": -185.86720275878906, + "loss": 0.9009, + "rewards/accuracies": 0.6031249761581421, + "rewards/chosen": -0.19201794266700745, + "rewards/margins": 0.09687992185354233, + "rewards/rejected": -0.288897842168808, + "step": 6930 + }, + { + "epoch": 2.02, + "learning_rate": 1.4449286990734992e-07, + "logits/chosen": -2.7192721366882324, + "logits/rejected": -2.7274882793426514, + "logps/chosen": -193.3112030029297, + "logps/rejected": -187.61007690429688, + "loss": 0.9233, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.2464873343706131, + "rewards/margins": 0.07503970712423325, + "rewards/rejected": -0.32152706384658813, + "step": 6940 + }, + { + "epoch": 2.03, + "learning_rate": 1.437241348039236e-07, + "logits/chosen": -2.7276501655578613, + "logits/rejected": -2.774181365966797, + "logps/chosen": -197.47341918945312, + "logps/rejected": -204.94204711914062, + "loss": 0.8835, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.23753318190574646, + "rewards/margins": 0.11532554775476456, + "rewards/rejected": -0.3528587222099304, + "step": 6950 + }, + { + "epoch": 2.03, + "learning_rate": 1.4295662426349253e-07, + "logits/chosen": -2.7351794242858887, + "logits/rejected": -2.7332804203033447, + "logps/chosen": -174.01608276367188, + "logps/rejected": -174.34007263183594, + "loss": 0.9153, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.20454581081867218, + "rewards/margins": 0.08692500740289688, + "rewards/rejected": -0.29147082567214966, + "step": 6960 + }, + { + "epoch": 2.03, + "learning_rate": 1.4219034712969196e-07, + "logits/chosen": -2.721445083618164, + "logits/rejected": -2.7228879928588867, + "logps/chosen": -201.0042724609375, + "logps/rejected": -187.95651245117188, + "loss": 0.9163, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.2132536619901657, + "rewards/margins": 0.07119356095790863, + "rewards/rejected": -0.28444722294807434, + "step": 6970 + }, + { + "epoch": 2.04, + "learning_rate": 1.414253122319457e-07, + "logits/chosen": -2.7350051403045654, + "logits/rejected": -2.7252516746520996, + "logps/chosen": -204.6690673828125, + "logps/rejected": -186.09927368164062, + "loss": 0.913, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.20454342663288116, + "rewards/margins": 0.09902454912662506, + "rewards/rejected": -0.30356794595718384, + "step": 6980 + }, + { + "epoch": 2.04, + "learning_rate": 1.4066152838536344e-07, + "logits/chosen": -2.734398603439331, + "logits/rejected": -2.72821307182312, + "logps/chosen": -195.73269653320312, + "logps/rejected": -177.6675567626953, + "loss": 0.9175, + "rewards/accuracies": 0.5406249761581421, + "rewards/chosen": -0.23298010230064392, + "rewards/margins": 0.08028466999530792, + "rewards/rejected": -0.31326478719711304, + "step": 6990 + }, + { + "epoch": 2.04, + "learning_rate": 1.398990043906397e-07, + "logits/chosen": -2.723540782928467, + "logits/rejected": -2.7197868824005127, + "logps/chosen": -187.7908172607422, + "logps/rejected": -179.2284393310547, + "loss": 0.9012, + "rewards/accuracies": 0.559374988079071, + "rewards/chosen": -0.23153071105480194, + "rewards/margins": 0.0925595834851265, + "rewards/rejected": -0.32409027218818665, + "step": 7000 + }, + { + "epoch": 2.04, + "eval_logits/chosen": -2.6416141986846924, + "eval_logits/rejected": -2.6360816955566406, + "eval_logps/chosen": -197.75148010253906, + "eval_logps/rejected": -183.8529510498047, + "eval_loss": 0.9321034550666809, + "eval_rewards/accuracies": 0.5766485333442688, + "eval_rewards/chosen": -0.21461807191371918, + "eval_rewards/margins": 0.07854754477739334, + "eval_rewards/rejected": -0.2931656241416931, + "eval_runtime": 443.3078, + "eval_samples_per_second": 26.539, + "eval_steps_per_second": 3.318, + "step": 7000 + }, + { + "epoch": 2.04, + "learning_rate": 1.3913774903395268e-07, + "logits/chosen": -2.7158100605010986, + "logits/rejected": -2.7378973960876465, + "logps/chosen": -169.9608612060547, + "logps/rejected": -175.0387420654297, + "loss": 0.9392, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.2130027711391449, + "rewards/margins": 0.06254032254219055, + "rewards/rejected": -0.27554312348365784, + "step": 7010 + }, + { + "epoch": 2.05, + "learning_rate": 1.3837777108686226e-07, + "logits/chosen": -2.7293105125427246, + "logits/rejected": -2.7177674770355225, + "logps/chosen": -196.72335815429688, + "logps/rejected": -182.2796173095703, + "loss": 0.9184, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.22791095077991486, + "rewards/margins": 0.0950903668999672, + "rewards/rejected": -0.32300129532814026, + "step": 7020 + }, + { + "epoch": 2.05, + "learning_rate": 1.376190793062098e-07, + "logits/chosen": -2.7231078147888184, + "logits/rejected": -2.741497755050659, + "logps/chosen": -177.41726684570312, + "logps/rejected": -179.03158569335938, + "loss": 0.9427, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.228541761636734, + "rewards/margins": 0.04261890798807144, + "rewards/rejected": -0.27116066217422485, + "step": 7030 + }, + { + "epoch": 2.05, + "learning_rate": 1.3686168243401657e-07, + "logits/chosen": -2.7197694778442383, + "logits/rejected": -2.726256847381592, + "logps/chosen": -209.73495483398438, + "logps/rejected": -202.67848205566406, + "loss": 0.9301, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.20739248394966125, + "rewards/margins": 0.10595047473907471, + "rewards/rejected": -0.31334295868873596, + "step": 7040 + }, + { + "epoch": 2.06, + "learning_rate": 1.361055891973833e-07, + "logits/chosen": -2.7275664806365967, + "logits/rejected": -2.7401270866394043, + "logps/chosen": -191.7522735595703, + "logps/rejected": -185.98114013671875, + "loss": 0.9006, + "rewards/accuracies": 0.6156250238418579, + "rewards/chosen": -0.18320171535015106, + "rewards/margins": 0.13302107155323029, + "rewards/rejected": -0.31622275710105896, + "step": 7050 + }, + { + "epoch": 2.06, + "learning_rate": 1.353508083083895e-07, + "logits/chosen": -2.7291016578674316, + "logits/rejected": -2.7423441410064697, + "logps/chosen": -186.4053497314453, + "logps/rejected": -173.64749145507812, + "loss": 0.9185, + "rewards/accuracies": 0.6031249761581421, + "rewards/chosen": -0.21921300888061523, + "rewards/margins": 0.08114122599363327, + "rewards/rejected": -0.3003542423248291, + "step": 7060 + }, + { + "epoch": 2.06, + "learning_rate": 1.3459734846399341e-07, + "logits/chosen": -2.733018398284912, + "logits/rejected": -2.741650104522705, + "logps/chosen": -202.03048706054688, + "logps/rejected": -192.7786865234375, + "loss": 0.931, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.20718999207019806, + "rewards/margins": 0.09407447278499603, + "rewards/rejected": -0.3012644648551941, + "step": 7070 + }, + { + "epoch": 2.07, + "learning_rate": 1.3384521834593137e-07, + "logits/chosen": -2.7663657665252686, + "logits/rejected": -2.7430965900421143, + "logps/chosen": -212.329833984375, + "logps/rejected": -184.7091827392578, + "loss": 0.9236, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.23409593105316162, + "rewards/margins": 0.09491056948900223, + "rewards/rejected": -0.32900652289390564, + "step": 7080 + }, + { + "epoch": 2.07, + "learning_rate": 1.3309442662061786e-07, + "logits/chosen": -2.731829881668091, + "logits/rejected": -2.7377142906188965, + "logps/chosen": -199.4453125, + "logps/rejected": -174.1707000732422, + "loss": 0.9292, + "rewards/accuracies": 0.590624988079071, + "rewards/chosen": -0.20361220836639404, + "rewards/margins": 0.08883138000965118, + "rewards/rejected": -0.2924436032772064, + "step": 7090 + }, + { + "epoch": 2.07, + "learning_rate": 1.3234498193904608e-07, + "logits/chosen": -2.7473537921905518, + "logits/rejected": -2.765782594680786, + "logps/chosen": -187.45675659179688, + "logps/rejected": -183.682373046875, + "loss": 0.9296, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.20840564370155334, + "rewards/margins": 0.08556272089481354, + "rewards/rejected": -0.2939683794975281, + "step": 7100 + }, + { + "epoch": 2.07, + "eval_logits/chosen": -2.651341199874878, + "eval_logits/rejected": -2.6459922790527344, + "eval_logps/chosen": -197.7322540283203, + "eval_logps/rejected": -183.85069274902344, + "eval_loss": 0.9314108490943909, + "eval_rewards/accuracies": 0.5780081748962402, + "eval_rewards/chosen": -0.21269488334655762, + "eval_rewards/margins": 0.08024676889181137, + "eval_rewards/rejected": -0.2929416596889496, + "eval_runtime": 443.2974, + "eval_samples_per_second": 26.54, + "eval_steps_per_second": 3.318, + "step": 7100 + }, + { + "epoch": 2.07, + "learning_rate": 1.3159689293668748e-07, + "logits/chosen": -2.751176357269287, + "logits/rejected": -2.738022565841675, + "logps/chosen": -198.71163940429688, + "logps/rejected": -175.29800415039062, + "loss": 0.9342, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -0.2539035975933075, + "rewards/margins": 0.04478713870048523, + "rewards/rejected": -0.2986907362937927, + "step": 7110 + }, + { + "epoch": 2.08, + "learning_rate": 1.308501682333931e-07, + "logits/chosen": -2.730717897415161, + "logits/rejected": -2.731663703918457, + "logps/chosen": -189.55604553222656, + "logps/rejected": -179.62344360351562, + "loss": 0.9256, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.20553156733512878, + "rewards/margins": 0.07954994589090347, + "rewards/rejected": -0.28508150577545166, + "step": 7120 + }, + { + "epoch": 2.08, + "learning_rate": 1.3010481643329368e-07, + "logits/chosen": -2.7590889930725098, + "logits/rejected": -2.7237637042999268, + "logps/chosen": -172.84356689453125, + "logps/rejected": -149.15286254882812, + "loss": 0.9285, + "rewards/accuracies": 0.6031249761581421, + "rewards/chosen": -0.21032579243183136, + "rewards/margins": 0.07369138300418854, + "rewards/rejected": -0.2840171456336975, + "step": 7130 + }, + { + "epoch": 2.08, + "learning_rate": 1.2936084612470065e-07, + "logits/chosen": -2.763084888458252, + "logits/rejected": -2.7535722255706787, + "logps/chosen": -205.8310546875, + "logps/rejected": -190.03878784179688, + "loss": 0.9197, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.2162288874387741, + "rewards/margins": 0.08808388561010361, + "rewards/rejected": -0.3043127655982971, + "step": 7140 + }, + { + "epoch": 2.09, + "learning_rate": 1.2861826588000712e-07, + "logits/chosen": -2.7227349281311035, + "logits/rejected": -2.743065357208252, + "logps/chosen": -190.4771270751953, + "logps/rejected": -186.38426208496094, + "loss": 0.9075, + "rewards/accuracies": 0.590624988079071, + "rewards/chosen": -0.22551707923412323, + "rewards/margins": 0.08020542562007904, + "rewards/rejected": -0.30572250485420227, + "step": 7150 + }, + { + "epoch": 2.09, + "learning_rate": 1.278770842555896e-07, + "logits/chosen": -2.731098175048828, + "logits/rejected": -2.719924211502075, + "logps/chosen": -188.09976196289062, + "logps/rejected": -171.93310546875, + "loss": 0.9163, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.20485873520374298, + "rewards/margins": 0.07870732247829437, + "rewards/rejected": -0.28356605768203735, + "step": 7160 + }, + { + "epoch": 2.09, + "learning_rate": 1.271373097917086e-07, + "logits/chosen": -2.7509634494781494, + "logits/rejected": -2.7207791805267334, + "logps/chosen": -204.354248046875, + "logps/rejected": -189.40679931640625, + "loss": 0.9383, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.21421301364898682, + "rewards/margins": 0.05646609514951706, + "rewards/rejected": -0.2706790864467621, + "step": 7170 + }, + { + "epoch": 2.09, + "learning_rate": 1.2639895101241073e-07, + "logits/chosen": -2.770415782928467, + "logits/rejected": -2.744168758392334, + "logps/chosen": -204.50999450683594, + "logps/rejected": -180.71023559570312, + "loss": 0.9295, + "rewards/accuracies": 0.5718749761581421, + "rewards/chosen": -0.20703864097595215, + "rewards/margins": 0.07825209945440292, + "rewards/rejected": -0.2852907180786133, + "step": 7180 + }, + { + "epoch": 2.1, + "learning_rate": 1.2566201642543054e-07, + "logits/chosen": -2.7546815872192383, + "logits/rejected": -2.7181484699249268, + "logps/chosen": -205.6988067626953, + "logps/rejected": -174.81179809570312, + "loss": 0.907, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.19236141443252563, + "rewards/margins": 0.09385807067155838, + "rewards/rejected": -0.2862195074558258, + "step": 7190 + }, + { + "epoch": 2.1, + "learning_rate": 1.249265145220921e-07, + "logits/chosen": -2.725691318511963, + "logits/rejected": -2.7101643085479736, + "logps/chosen": -198.37322998046875, + "logps/rejected": -186.52853393554688, + "loss": 0.9076, + "rewards/accuracies": 0.590624988079071, + "rewards/chosen": -0.24150443077087402, + "rewards/margins": 0.11702193319797516, + "rewards/rejected": -0.358526349067688, + "step": 7200 + }, + { + "epoch": 2.1, + "eval_logits/chosen": -2.65535831451416, + "eval_logits/rejected": -2.6500964164733887, + "eval_logps/chosen": -197.7506866455078, + "eval_logps/rejected": -183.865966796875, + "eval_loss": 0.9315347075462341, + "eval_rewards/accuracies": 0.579707682132721, + "eval_rewards/chosen": -0.21453608572483063, + "eval_rewards/margins": 0.07993295043706894, + "eval_rewards/rejected": -0.29446902871131897, + "eval_runtime": 443.2574, + "eval_samples_per_second": 26.542, + "eval_steps_per_second": 3.319, + "step": 7200 + }, + { + "epoch": 2.1, + "learning_rate": 1.2419245377721166e-07, + "logits/chosen": -2.744749069213867, + "logits/rejected": -2.7440600395202637, + "logps/chosen": -192.15850830078125, + "logps/rejected": -178.24766540527344, + "loss": 0.9182, + "rewards/accuracies": 0.596875011920929, + "rewards/chosen": -0.2116578072309494, + "rewards/margins": 0.06727829575538635, + "rewards/rejected": -0.27893608808517456, + "step": 7210 + }, + { + "epoch": 2.11, + "learning_rate": 1.2345984264899953e-07, + "logits/chosen": -2.744431257247925, + "logits/rejected": -2.7419965267181396, + "logps/chosen": -204.25119018554688, + "logps/rejected": -192.86651611328125, + "loss": 0.9103, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.19741736352443695, + "rewards/margins": 0.08304335176944733, + "rewards/rejected": -0.2804606854915619, + "step": 7220 + }, + { + "epoch": 2.11, + "learning_rate": 1.2272868957896285e-07, + "logits/chosen": -2.735452890396118, + "logits/rejected": -2.7617390155792236, + "logps/chosen": -204.0012969970703, + "logps/rejected": -200.9781951904297, + "loss": 0.9228, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.20529159903526306, + "rewards/margins": 0.10176429897546768, + "rewards/rejected": -0.30705589056015015, + "step": 7230 + }, + { + "epoch": 2.11, + "learning_rate": 1.2199900299180823e-07, + "logits/chosen": -2.722921848297119, + "logits/rejected": -2.7459399700164795, + "logps/chosen": -193.969482421875, + "logps/rejected": -197.5996856689453, + "loss": 0.9122, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.22556057572364807, + "rewards/margins": 0.10073423385620117, + "rewards/rejected": -0.32629480957984924, + "step": 7240 + }, + { + "epoch": 2.11, + "learning_rate": 1.2127079129534502e-07, + "logits/chosen": -2.7340056896209717, + "logits/rejected": -2.739253282546997, + "logps/chosen": -192.6339569091797, + "logps/rejected": -180.08802795410156, + "loss": 0.9046, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.20382554829120636, + "rewards/margins": 0.10780832916498184, + "rewards/rejected": -0.3116338849067688, + "step": 7250 + }, + { + "epoch": 2.12, + "learning_rate": 1.2054406288038787e-07, + "logits/chosen": -2.751988410949707, + "logits/rejected": -2.7704575061798096, + "logps/chosen": -205.4809112548828, + "logps/rejected": -196.97647094726562, + "loss": 0.9161, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.19896064698696136, + "rewards/margins": 0.11849311739206314, + "rewards/rejected": -0.3174537718296051, + "step": 7260 + }, + { + "epoch": 2.12, + "learning_rate": 1.198188261206603e-07, + "logits/chosen": -2.7551045417785645, + "logits/rejected": -2.750485897064209, + "logps/chosen": -218.8869171142578, + "logps/rejected": -208.7373046875, + "loss": 0.9106, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.22199001908302307, + "rewards/margins": 0.1098296269774437, + "rewards/rejected": -0.33181968331336975, + "step": 7270 + }, + { + "epoch": 2.12, + "learning_rate": 1.1909508937269857e-07, + "logits/chosen": -2.7286875247955322, + "logits/rejected": -2.7499356269836426, + "logps/chosen": -191.07275390625, + "logps/rejected": -179.6157684326172, + "loss": 0.9193, + "rewards/accuracies": 0.596875011920929, + "rewards/chosen": -0.1898176223039627, + "rewards/margins": 0.0719531923532486, + "rewards/rejected": -0.2617708146572113, + "step": 7280 + }, + { + "epoch": 2.13, + "learning_rate": 1.1837286097575466e-07, + "logits/chosen": -2.7580971717834473, + "logits/rejected": -2.7623705863952637, + "logps/chosen": -196.2259979248047, + "logps/rejected": -181.41830444335938, + "loss": 0.9182, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.21018242835998535, + "rewards/margins": 0.08354298770427704, + "rewards/rejected": -0.2937254309654236, + "step": 7290 + }, + { + "epoch": 2.13, + "learning_rate": 1.1765214925170097e-07, + "logits/chosen": -2.7369384765625, + "logits/rejected": -2.742072582244873, + "logps/chosen": -178.70603942871094, + "logps/rejected": -172.1625213623047, + "loss": 0.922, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.21116510033607483, + "rewards/margins": 0.08774997293949127, + "rewards/rejected": -0.2989150881767273, + "step": 7300 + }, + { + "epoch": 2.13, + "eval_logits/chosen": -2.656193256378174, + "eval_logits/rejected": -2.6509604454040527, + "eval_logps/chosen": -197.75233459472656, + "eval_logps/rejected": -183.85647583007812, + "eval_loss": 0.9315423369407654, + "eval_rewards/accuracies": 0.5791978240013123, + "eval_rewards/chosen": -0.2147025316953659, + "eval_rewards/margins": 0.07881684601306915, + "eval_rewards/rejected": -0.29351937770843506, + "eval_runtime": 443.3236, + "eval_samples_per_second": 26.538, + "eval_steps_per_second": 3.318, + "step": 7300 + }, + { + "epoch": 2.13, + "learning_rate": 1.1693296250493376e-07, + "logits/chosen": -2.7492427825927734, + "logits/rejected": -2.758079767227173, + "logps/chosen": -206.41268920898438, + "logps/rejected": -205.76123046875, + "loss": 0.9047, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.22261814773082733, + "rewards/margins": 0.10148187726736069, + "rewards/rejected": -0.3241000175476074, + "step": 7310 + }, + { + "epoch": 2.14, + "learning_rate": 1.1621530902227781e-07, + "logits/chosen": -2.745958089828491, + "logits/rejected": -2.737013339996338, + "logps/chosen": -192.68576049804688, + "logps/rejected": -179.25247192382812, + "loss": 0.9302, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.22553372383117676, + "rewards/margins": 0.061601459980010986, + "rewards/rejected": -0.28713518381118774, + "step": 7320 + }, + { + "epoch": 2.14, + "learning_rate": 1.1549919707289088e-07, + "logits/chosen": -2.722991466522217, + "logits/rejected": -2.731642246246338, + "logps/chosen": -164.6597442626953, + "logps/rejected": -165.81527709960938, + "loss": 0.9152, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.1901760846376419, + "rewards/margins": 0.10527213662862778, + "rewards/rejected": -0.2954482436180115, + "step": 7330 + }, + { + "epoch": 2.14, + "learning_rate": 1.1478463490816858e-07, + "logits/chosen": -2.7464330196380615, + "logits/rejected": -2.751593589782715, + "logps/chosen": -188.0258331298828, + "logps/rejected": -181.29013061523438, + "loss": 0.9107, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.2331833839416504, + "rewards/margins": 0.08430353552103043, + "rewards/rejected": -0.3174869418144226, + "step": 7340 + }, + { + "epoch": 2.14, + "learning_rate": 1.1407163076164894e-07, + "logits/chosen": -2.7444779872894287, + "logits/rejected": -2.7680869102478027, + "logps/chosen": -190.80401611328125, + "logps/rejected": -192.6204833984375, + "loss": 0.912, + "rewards/accuracies": 0.6156250238418579, + "rewards/chosen": -0.23358038067817688, + "rewards/margins": 0.11961646378040314, + "rewards/rejected": -0.35319679975509644, + "step": 7350 + }, + { + "epoch": 2.15, + "learning_rate": 1.1336019284891767e-07, + "logits/chosen": -2.722039222717285, + "logits/rejected": -2.701793670654297, + "logps/chosen": -185.11410522460938, + "logps/rejected": -162.76284790039062, + "loss": 0.9138, + "rewards/accuracies": 0.596875011920929, + "rewards/chosen": -0.19528807699680328, + "rewards/margins": 0.0973658487200737, + "rewards/rejected": -0.2926539182662964, + "step": 7360 + }, + { + "epoch": 2.15, + "learning_rate": 1.1265032936751395e-07, + "logits/chosen": -2.725271224975586, + "logits/rejected": -2.7608556747436523, + "logps/chosen": -188.7613067626953, + "logps/rejected": -195.23399353027344, + "loss": 0.8958, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.22539019584655762, + "rewards/margins": 0.10492382943630219, + "rewards/rejected": -0.3303140103816986, + "step": 7370 + }, + { + "epoch": 2.15, + "learning_rate": 1.1194204849683509e-07, + "logits/chosen": -2.761343002319336, + "logits/rejected": -2.734705686569214, + "logps/chosen": -214.1074676513672, + "logps/rejected": -187.6863250732422, + "loss": 0.9251, + "rewards/accuracies": 0.5718749761581421, + "rewards/chosen": -0.2229997217655182, + "rewards/margins": 0.06879208981990814, + "rewards/rejected": -0.29179179668426514, + "step": 7380 + }, + { + "epoch": 2.16, + "learning_rate": 1.1123535839804318e-07, + "logits/chosen": -2.7466766834259033, + "logits/rejected": -2.7543885707855225, + "logps/chosen": -204.26852416992188, + "logps/rejected": -199.51849365234375, + "loss": 0.9263, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.22250142693519592, + "rewards/margins": 0.08121421933174133, + "rewards/rejected": -0.30371564626693726, + "step": 7390 + }, + { + "epoch": 2.16, + "learning_rate": 1.1053026721397044e-07, + "logits/chosen": -2.734654426574707, + "logits/rejected": -2.7429187297821045, + "logps/chosen": -204.10427856445312, + "logps/rejected": -194.6341094970703, + "loss": 0.9136, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.21057207882404327, + "rewards/margins": 0.12202072143554688, + "rewards/rejected": -0.33259281516075134, + "step": 7400 + }, + { + "epoch": 2.16, + "eval_logits/chosen": -2.646413564682007, + "eval_logits/rejected": -2.640982151031494, + "eval_logps/chosen": -197.75148010253906, + "eval_logps/rejected": -183.86248779296875, + "eval_loss": 0.9313158392906189, + "eval_rewards/accuracies": 0.5819170475006104, + "eval_rewards/chosen": -0.21462056040763855, + "eval_rewards/margins": 0.07949841767549515, + "eval_rewards/rejected": -0.2941189706325531, + "eval_runtime": 443.2671, + "eval_samples_per_second": 26.542, + "eval_steps_per_second": 3.319, + "step": 7400 + }, + { + "epoch": 2.16, + "learning_rate": 1.0982678306902563e-07, + "logits/chosen": -2.750333786010742, + "logits/rejected": -2.7447495460510254, + "logps/chosen": -216.957275390625, + "logps/rejected": -195.1598663330078, + "loss": 0.9426, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.26006537675857544, + "rewards/margins": 0.05888168886303902, + "rewards/rejected": -0.3189470171928406, + "step": 7410 + }, + { + "epoch": 2.16, + "learning_rate": 1.0912491406910032e-07, + "logits/chosen": -2.7206127643585205, + "logits/rejected": -2.7083652019500732, + "logps/chosen": -183.6171112060547, + "logps/rejected": -179.92808532714844, + "loss": 0.9218, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.2206425666809082, + "rewards/margins": 0.06548481434583664, + "rewards/rejected": -0.28612738847732544, + "step": 7420 + }, + { + "epoch": 2.17, + "learning_rate": 1.084246683014759e-07, + "logits/chosen": -2.776256561279297, + "logits/rejected": -2.7593002319335938, + "logps/chosen": -201.40565490722656, + "logps/rejected": -187.70831298828125, + "loss": 0.9257, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.21603736281394958, + "rewards/margins": 0.10193196684122086, + "rewards/rejected": -0.31796932220458984, + "step": 7430 + }, + { + "epoch": 2.17, + "learning_rate": 1.0772605383472977e-07, + "logits/chosen": -2.7164487838745117, + "logits/rejected": -2.7195496559143066, + "logps/chosen": -195.38851928710938, + "logps/rejected": -189.60653686523438, + "loss": 0.9409, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.23506739735603333, + "rewards/margins": 0.055074017494916916, + "rewards/rejected": -0.29014140367507935, + "step": 7440 + }, + { + "epoch": 2.17, + "learning_rate": 1.0702907871864267e-07, + "logits/chosen": -2.731732130050659, + "logits/rejected": -2.7372167110443115, + "logps/chosen": -195.2466583251953, + "logps/rejected": -184.58062744140625, + "loss": 0.9204, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.22982437908649445, + "rewards/margins": 0.08360082656145096, + "rewards/rejected": -0.3134251832962036, + "step": 7450 + }, + { + "epoch": 2.18, + "learning_rate": 1.0633375098410627e-07, + "logits/chosen": -2.71661639213562, + "logits/rejected": -2.7214877605438232, + "logps/chosen": -181.84878540039062, + "logps/rejected": -177.9984130859375, + "loss": 0.9279, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.236176535487175, + "rewards/margins": 0.08922503143548965, + "rewards/rejected": -0.32540157437324524, + "step": 7460 + }, + { + "epoch": 2.18, + "learning_rate": 1.0564007864302982e-07, + "logits/chosen": -2.7508246898651123, + "logits/rejected": -2.7447965145111084, + "logps/chosen": -208.2515411376953, + "logps/rejected": -188.55581665039062, + "loss": 0.926, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.24827198684215546, + "rewards/margins": 0.07217380404472351, + "rewards/rejected": -0.32044583559036255, + "step": 7470 + }, + { + "epoch": 2.18, + "learning_rate": 1.0494806968824877e-07, + "logits/chosen": -2.7589292526245117, + "logits/rejected": -2.7438571453094482, + "logps/chosen": -210.2908477783203, + "logps/rejected": -189.3018341064453, + "loss": 0.9249, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.2212313860654831, + "rewards/margins": 0.08061711490154266, + "rewards/rejected": -0.30184850096702576, + "step": 7480 + }, + { + "epoch": 2.18, + "learning_rate": 1.0425773209343189e-07, + "logits/chosen": -2.736321210861206, + "logits/rejected": -2.7403640747070312, + "logps/chosen": -189.15023803710938, + "logps/rejected": -180.63282775878906, + "loss": 0.9258, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.2023574411869049, + "rewards/margins": 0.11086982488632202, + "rewards/rejected": -0.31322723627090454, + "step": 7490 + }, + { + "epoch": 2.19, + "learning_rate": 1.0356907381298983e-07, + "logits/chosen": -2.7436957359313965, + "logits/rejected": -2.734558582305908, + "logps/chosen": -207.8624725341797, + "logps/rejected": -191.48626708984375, + "loss": 0.9401, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.22266311943531036, + "rewards/margins": 0.07991985976696014, + "rewards/rejected": -0.30258291959762573, + "step": 7500 + }, + { + "epoch": 2.19, + "eval_logits/chosen": -2.6543385982513428, + "eval_logits/rejected": -2.6490490436553955, + "eval_logps/chosen": -197.7451171875, + "eval_logps/rejected": -183.85833740234375, + "eval_loss": 0.9313808083534241, + "eval_rewards/accuracies": 0.5798776149749756, + "eval_rewards/chosen": -0.21397916972637177, + "eval_rewards/margins": 0.07972751557826996, + "eval_rewards/rejected": -0.2937066853046417, + "eval_runtime": 443.3645, + "eval_samples_per_second": 26.536, + "eval_steps_per_second": 3.318, + "step": 7500 + }, + { + "epoch": 2.19, + "learning_rate": 1.0288210278198319e-07, + "logits/chosen": -2.6996657848358154, + "logits/rejected": -2.7122485637664795, + "logps/chosen": -177.2788848876953, + "logps/rejected": -162.20631408691406, + "loss": 0.9112, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.20859304070472717, + "rewards/margins": 0.1111995205283165, + "rewards/rejected": -0.31979256868362427, + "step": 7510 + }, + { + "epoch": 2.19, + "learning_rate": 1.0219682691603168e-07, + "logits/chosen": -2.7130396366119385, + "logits/rejected": -2.728550434112549, + "logps/chosen": -187.67735290527344, + "logps/rejected": -184.43748474121094, + "loss": 0.9016, + "rewards/accuracies": 0.621874988079071, + "rewards/chosen": -0.19813624024391174, + "rewards/margins": 0.0911586657166481, + "rewards/rejected": -0.28929489850997925, + "step": 7520 + }, + { + "epoch": 2.2, + "learning_rate": 1.0151325411122206e-07, + "logits/chosen": -2.7337875366210938, + "logits/rejected": -2.734790325164795, + "logps/chosen": -184.33343505859375, + "logps/rejected": -181.77609252929688, + "loss": 0.9207, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.2145807445049286, + "rewards/margins": 0.06570100784301758, + "rewards/rejected": -0.28028178215026855, + "step": 7530 + }, + { + "epoch": 2.2, + "learning_rate": 1.0083139224401762e-07, + "logits/chosen": -2.7556958198547363, + "logits/rejected": -2.754171371459961, + "logps/chosen": -203.33828735351562, + "logps/rejected": -185.1107177734375, + "loss": 0.9272, + "rewards/accuracies": 0.6031249761581421, + "rewards/chosen": -0.197455495595932, + "rewards/margins": 0.0879480391740799, + "rewards/rejected": -0.2854035198688507, + "step": 7540 + }, + { + "epoch": 2.2, + "learning_rate": 1.0015124917116769e-07, + "logits/chosen": -2.75793719291687, + "logits/rejected": -2.7613115310668945, + "logps/chosen": -197.8968505859375, + "logps/rejected": -190.76565551757812, + "loss": 0.9236, + "rewards/accuracies": 0.5531250238418579, + "rewards/chosen": -0.2403552234172821, + "rewards/margins": 0.08378183841705322, + "rewards/rejected": -0.3241370618343353, + "step": 7550 + }, + { + "epoch": 2.21, + "learning_rate": 9.947283272961637e-08, + "logits/chosen": -2.722438335418701, + "logits/rejected": -2.7054498195648193, + "logps/chosen": -175.672607421875, + "logps/rejected": -162.665771484375, + "loss": 0.9231, + "rewards/accuracies": 0.6031249761581421, + "rewards/chosen": -0.1979292929172516, + "rewards/margins": 0.07674896717071533, + "rewards/rejected": -0.2746782898902893, + "step": 7560 + }, + { + "epoch": 2.21, + "learning_rate": 9.879615073641309e-08, + "logits/chosen": -2.763669729232788, + "logits/rejected": -2.7318906784057617, + "logps/chosen": -201.5901336669922, + "logps/rejected": -178.46498107910156, + "loss": 0.9179, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.19735045731067657, + "rewards/margins": 0.09358976781368256, + "rewards/rejected": -0.29094022512435913, + "step": 7570 + }, + { + "epoch": 2.21, + "learning_rate": 9.812121098862184e-08, + "logits/chosen": -2.7316806316375732, + "logits/rejected": -2.743900775909424, + "logps/chosen": -194.90371704101562, + "logps/rejected": -187.50155639648438, + "loss": 0.9325, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.22696837782859802, + "rewards/margins": 0.060942865908145905, + "rewards/rejected": -0.28791123628616333, + "step": 7580 + }, + { + "epoch": 2.21, + "learning_rate": 9.744802126323159e-08, + "logits/chosen": -2.7149295806884766, + "logits/rejected": -2.7186684608459473, + "logps/chosen": -214.962158203125, + "logps/rejected": -186.78811645507812, + "loss": 0.9203, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.23681148886680603, + "rewards/margins": 0.04845339059829712, + "rewards/rejected": -0.28526487946510315, + "step": 7590 + }, + { + "epoch": 2.22, + "learning_rate": 9.677658931706676e-08, + "logits/chosen": -2.726931095123291, + "logits/rejected": -2.715970516204834, + "logps/chosen": -189.72854614257812, + "logps/rejected": -176.05377197265625, + "loss": 0.9295, + "rewards/accuracies": 0.5718749761581421, + "rewards/chosen": -0.2344752848148346, + "rewards/margins": 0.06562696397304535, + "rewards/rejected": -0.30010226368904114, + "step": 7600 + }, + { + "epoch": 2.22, + "eval_logits/chosen": -2.6620023250579834, + "eval_logits/rejected": -2.6568603515625, + "eval_logps/chosen": -197.7584991455078, + "eval_logps/rejected": -183.87474060058594, + "eval_loss": 0.9313289523124695, + "eval_rewards/accuracies": 0.581237256526947, + "eval_rewards/chosen": -0.21531949937343597, + "eval_rewards/margins": 0.08002565056085587, + "eval_rewards/rejected": -0.29534512758255005, + "eval_runtime": 443.2661, + "eval_samples_per_second": 26.542, + "eval_steps_per_second": 3.319, + "step": 7600 + }, + { + "epoch": 2.22, + "learning_rate": 9.61069228866979e-08, + "logits/chosen": -2.7586958408355713, + "logits/rejected": -2.759864330291748, + "logps/chosen": -206.00894165039062, + "logps/rejected": -192.7720947265625, + "loss": 0.9273, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.23331649601459503, + "rewards/margins": 0.08103559911251068, + "rewards/rejected": -0.3143520951271057, + "step": 7610 + }, + { + "epoch": 2.22, + "learning_rate": 9.543902968835219e-08, + "logits/chosen": -2.7187604904174805, + "logits/rejected": -2.7171874046325684, + "logps/chosen": -193.1675262451172, + "logps/rejected": -175.18508911132812, + "loss": 0.9277, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.2174643725156784, + "rewards/margins": 0.06399818509817123, + "rewards/rejected": -0.28146255016326904, + "step": 7620 + }, + { + "epoch": 2.23, + "learning_rate": 9.477291741782478e-08, + "logits/chosen": -2.7655746936798096, + "logits/rejected": -2.743079662322998, + "logps/chosen": -214.4647979736328, + "logps/rejected": -198.39024353027344, + "loss": 0.946, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.25131624937057495, + "rewards/margins": 0.06656187027692795, + "rewards/rejected": -0.3178780972957611, + "step": 7630 + }, + { + "epoch": 2.23, + "learning_rate": 9.410859375039034e-08, + "logits/chosen": -2.759535312652588, + "logits/rejected": -2.746966600418091, + "logps/chosen": -194.54786682128906, + "logps/rejected": -180.0648193359375, + "loss": 0.9223, + "rewards/accuracies": 0.6031249761581421, + "rewards/chosen": -0.21222415566444397, + "rewards/margins": 0.07545609772205353, + "rewards/rejected": -0.2876802384853363, + "step": 7640 + }, + { + "epoch": 2.23, + "learning_rate": 9.3446066340714e-08, + "logits/chosen": -2.757150888442993, + "logits/rejected": -2.7474305629730225, + "logps/chosen": -220.7982635498047, + "logps/rejected": -201.95033264160156, + "loss": 0.9162, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.1936686933040619, + "rewards/margins": 0.11243748664855957, + "rewards/rejected": -0.3061061501502991, + "step": 7650 + }, + { + "epoch": 2.23, + "learning_rate": 9.27853428227639e-08, + "logits/chosen": -2.767207622528076, + "logits/rejected": -2.7521488666534424, + "logps/chosen": -209.8219757080078, + "logps/rejected": -188.6089324951172, + "loss": 0.9515, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.2451823651790619, + "rewards/margins": 0.04614636301994324, + "rewards/rejected": -0.2913287281990051, + "step": 7660 + }, + { + "epoch": 2.24, + "learning_rate": 9.21264308097226e-08, + "logits/chosen": -2.719449758529663, + "logits/rejected": -2.7556424140930176, + "logps/chosen": -165.529052734375, + "logps/rejected": -181.20455932617188, + "loss": 0.9016, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.22411219775676727, + "rewards/margins": 0.09404568374156952, + "rewards/rejected": -0.31815794110298157, + "step": 7670 + }, + { + "epoch": 2.24, + "learning_rate": 9.146933789389963e-08, + "logits/chosen": -2.7221908569335938, + "logits/rejected": -2.710775852203369, + "logps/chosen": -193.11004638671875, + "logps/rejected": -176.21279907226562, + "loss": 0.9268, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.20240387320518494, + "rewards/margins": 0.06890231370925903, + "rewards/rejected": -0.27130618691444397, + "step": 7680 + }, + { + "epoch": 2.24, + "learning_rate": 9.081407164664396e-08, + "logits/chosen": -2.761277914047241, + "logits/rejected": -2.739208459854126, + "logps/chosen": -220.0433807373047, + "logps/rejected": -195.45925903320312, + "loss": 0.939, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.23591816425323486, + "rewards/margins": 0.08795022964477539, + "rewards/rejected": -0.32386839389801025, + "step": 7690 + }, + { + "epoch": 2.25, + "learning_rate": 9.016063961825698e-08, + "logits/chosen": -2.7358932495117188, + "logits/rejected": -2.7563042640686035, + "logps/chosen": -190.58761596679688, + "logps/rejected": -178.07778930664062, + "loss": 0.9128, + "rewards/accuracies": 0.6031249761581421, + "rewards/chosen": -0.19711415469646454, + "rewards/margins": 0.09829168021678925, + "rewards/rejected": -0.2954058349132538, + "step": 7700 + }, + { + "epoch": 2.25, + "eval_logits/chosen": -2.655303716659546, + "eval_logits/rejected": -2.65004301071167, + "eval_logps/chosen": -197.75904846191406, + "eval_logps/rejected": -183.8813934326172, + "eval_loss": 0.9308503270149231, + "eval_rewards/accuracies": 0.5817471146583557, + "eval_rewards/chosen": -0.21537372469902039, + "eval_rewards/margins": 0.08063706010580063, + "eval_rewards/rejected": -0.2960107922554016, + "eval_runtime": 443.7199, + "eval_samples_per_second": 26.514, + "eval_steps_per_second": 3.315, + "step": 7700 + }, + { + "epoch": 2.25, + "learning_rate": 8.950904933790512e-08, + "logits/chosen": -2.725898265838623, + "logits/rejected": -2.7195515632629395, + "logps/chosen": -189.76937866210938, + "logps/rejected": -179.32357788085938, + "loss": 0.9147, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.18170380592346191, + "rewards/margins": 0.10253002494573593, + "rewards/rejected": -0.28423386812210083, + "step": 7710 + }, + { + "epoch": 2.25, + "learning_rate": 8.885930831353328e-08, + "logits/chosen": -2.709311008453369, + "logits/rejected": -2.7129642963409424, + "logps/chosen": -189.28977966308594, + "logps/rejected": -172.34219360351562, + "loss": 0.9186, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.21395018696784973, + "rewards/margins": 0.06539814174175262, + "rewards/rejected": -0.27934831380844116, + "step": 7720 + }, + { + "epoch": 2.25, + "learning_rate": 8.821142403177845e-08, + "logits/chosen": -2.728788137435913, + "logits/rejected": -2.7360405921936035, + "logps/chosen": -196.939453125, + "logps/rejected": -191.22731018066406, + "loss": 0.9326, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.21163785457611084, + "rewards/margins": 0.06347735226154327, + "rewards/rejected": -0.2751151919364929, + "step": 7730 + }, + { + "epoch": 2.26, + "learning_rate": 8.756540395788311e-08, + "logits/chosen": -2.734846591949463, + "logits/rejected": -2.7414629459381104, + "logps/chosen": -197.96871948242188, + "logps/rejected": -188.20376586914062, + "loss": 0.9393, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.21204037964344025, + "rewards/margins": 0.07741545140743256, + "rewards/rejected": -0.2894558310508728, + "step": 7740 + }, + { + "epoch": 2.26, + "learning_rate": 8.692125553560976e-08, + "logits/chosen": -2.717087984085083, + "logits/rejected": -2.740471363067627, + "logps/chosen": -194.17306518554688, + "logps/rejected": -194.03067016601562, + "loss": 0.9164, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.25672441720962524, + "rewards/margins": 0.08338448405265808, + "rewards/rejected": -0.3401089310646057, + "step": 7750 + }, + { + "epoch": 2.26, + "learning_rate": 8.62789861871544e-08, + "logits/chosen": -2.704550266265869, + "logits/rejected": -2.7091641426086426, + "logps/chosen": -188.353515625, + "logps/rejected": -172.57882690429688, + "loss": 0.9066, + "rewards/accuracies": 0.590624988079071, + "rewards/chosen": -0.20504169166088104, + "rewards/margins": 0.09637182950973511, + "rewards/rejected": -0.30141353607177734, + "step": 7760 + }, + { + "epoch": 2.27, + "learning_rate": 8.563860331306158e-08, + "logits/chosen": -2.728397846221924, + "logits/rejected": -2.7323508262634277, + "logps/chosen": -194.7670135498047, + "logps/rejected": -179.7714080810547, + "loss": 0.928, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2068866491317749, + "rewards/margins": 0.08743634819984436, + "rewards/rejected": -0.29432302713394165, + "step": 7770 + }, + { + "epoch": 2.27, + "learning_rate": 8.500011429213916e-08, + "logits/chosen": -2.733534574508667, + "logits/rejected": -2.7395362854003906, + "logps/chosen": -196.14666748046875, + "logps/rejected": -186.81178283691406, + "loss": 0.9252, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.2137400358915329, + "rewards/margins": 0.09642390161752701, + "rewards/rejected": -0.3101639747619629, + "step": 7780 + }, + { + "epoch": 2.27, + "learning_rate": 8.43635264813728e-08, + "logits/chosen": -2.758620262145996, + "logits/rejected": -2.750257730484009, + "logps/chosen": -211.9737548828125, + "logps/rejected": -193.65223693847656, + "loss": 0.9105, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.1767754703760147, + "rewards/margins": 0.12258114665746689, + "rewards/rejected": -0.2993565797805786, + "step": 7790 + }, + { + "epoch": 2.28, + "learning_rate": 8.372884721584155e-08, + "logits/chosen": -2.7193734645843506, + "logits/rejected": -2.7123632431030273, + "logps/chosen": -184.4526824951172, + "logps/rejected": -167.96360778808594, + "loss": 0.9074, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.20129454135894775, + "rewards/margins": 0.10267508029937744, + "rewards/rejected": -0.3039696216583252, + "step": 7800 + }, + { + "epoch": 2.28, + "eval_logits/chosen": -2.6557297706604004, + "eval_logits/rejected": -2.6504623889923096, + "eval_logps/chosen": -197.76478576660156, + "eval_logps/rejected": -183.8850555419922, + "eval_loss": 0.9311633706092834, + "eval_rewards/accuracies": 0.5836166143417358, + "eval_rewards/chosen": -0.2159472107887268, + "eval_rewards/margins": 0.08042870461940765, + "eval_rewards/rejected": -0.29637593030929565, + "eval_runtime": 443.4949, + "eval_samples_per_second": 26.528, + "eval_steps_per_second": 3.317, + "step": 7800 + }, + { + "epoch": 2.28, + "learning_rate": 8.309608380863328e-08, + "logits/chosen": -2.7333827018737793, + "logits/rejected": -2.7506890296936035, + "logps/chosen": -175.94215393066406, + "logps/rejected": -184.37400817871094, + "loss": 0.9142, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.2469567507505417, + "rewards/margins": 0.07817981392145157, + "rewards/rejected": -0.32513657212257385, + "step": 7810 + }, + { + "epoch": 2.28, + "learning_rate": 8.246524355076057e-08, + "logits/chosen": -2.7284388542175293, + "logits/rejected": -2.736680030822754, + "logps/chosen": -192.95489501953125, + "logps/rejected": -193.68785095214844, + "loss": 0.9323, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.2360697090625763, + "rewards/margins": 0.10624003410339355, + "rewards/rejected": -0.34230974316596985, + "step": 7820 + }, + { + "epoch": 2.28, + "learning_rate": 8.183633371107615e-08, + "logits/chosen": -2.7205255031585693, + "logits/rejected": -2.7430524826049805, + "logps/chosen": -182.37574768066406, + "logps/rejected": -185.91189575195312, + "loss": 0.9101, + "rewards/accuracies": 0.6031249761581421, + "rewards/chosen": -0.2163948267698288, + "rewards/margins": 0.08373256027698517, + "rewards/rejected": -0.3001273572444916, + "step": 7830 + }, + { + "epoch": 2.29, + "learning_rate": 8.120936153618996e-08, + "logits/chosen": -2.704521894454956, + "logits/rejected": -2.7069900035858154, + "logps/chosen": -181.1570281982422, + "logps/rejected": -166.2770233154297, + "loss": 0.9296, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.2072068452835083, + "rewards/margins": 0.07812671363353729, + "rewards/rejected": -0.2853335738182068, + "step": 7840 + }, + { + "epoch": 2.29, + "learning_rate": 8.058433425038483e-08, + "logits/chosen": -2.7473597526550293, + "logits/rejected": -2.7511062622070312, + "logps/chosen": -181.43115234375, + "logps/rejected": -170.19798278808594, + "loss": 0.9204, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2030690610408783, + "rewards/margins": 0.08421279489994049, + "rewards/rejected": -0.28728190064430237, + "step": 7850 + }, + { + "epoch": 2.29, + "learning_rate": 7.996125905553375e-08, + "logits/chosen": -2.7208352088928223, + "logits/rejected": -2.7141966819763184, + "logps/chosen": -184.7169952392578, + "logps/rejected": -184.3052520751953, + "loss": 0.9415, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.2215987890958786, + "rewards/margins": 0.07141195237636566, + "rewards/rejected": -0.29301077127456665, + "step": 7860 + }, + { + "epoch": 2.3, + "learning_rate": 7.934014313101692e-08, + "logits/chosen": -2.749361515045166, + "logits/rejected": -2.7265572547912598, + "logps/chosen": -219.4988555908203, + "logps/rejected": -185.86529541015625, + "loss": 0.9196, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.21059224009513855, + "rewards/margins": 0.10057976096868515, + "rewards/rejected": -0.3111719787120819, + "step": 7870 + }, + { + "epoch": 2.3, + "learning_rate": 7.87209936336386e-08, + "logits/chosen": -2.746774911880493, + "logits/rejected": -2.7589919567108154, + "logps/chosen": -197.74710083007812, + "logps/rejected": -199.05357360839844, + "loss": 0.9093, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.20615649223327637, + "rewards/margins": 0.09410884976387024, + "rewards/rejected": -0.3002653121948242, + "step": 7880 + }, + { + "epoch": 2.3, + "learning_rate": 7.810381769754506e-08, + "logits/chosen": -2.752405881881714, + "logits/rejected": -2.7523396015167236, + "logps/chosen": -205.9868621826172, + "logps/rejected": -192.83425903320312, + "loss": 0.9399, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.2012767344713211, + "rewards/margins": 0.0421002134680748, + "rewards/rejected": -0.2433769404888153, + "step": 7890 + }, + { + "epoch": 2.3, + "learning_rate": 7.748862243414211e-08, + "logits/chosen": -2.7154974937438965, + "logits/rejected": -2.726799488067627, + "logps/chosen": -193.008544921875, + "logps/rejected": -182.70101928710938, + "loss": 0.9114, + "rewards/accuracies": 0.6031249761581421, + "rewards/chosen": -0.20253929495811462, + "rewards/margins": 0.07466837018728256, + "rewards/rejected": -0.2772076725959778, + "step": 7900 + }, + { + "epoch": 2.3, + "eval_logits/chosen": -2.6478512287139893, + "eval_logits/rejected": -2.642455577850342, + "eval_logps/chosen": -197.75442504882812, + "eval_logps/rejected": -183.87030029296875, + "eval_loss": 0.9310198426246643, + "eval_rewards/accuracies": 0.5836166143417358, + "eval_rewards/chosen": -0.21491007506847382, + "eval_rewards/margins": 0.07999106496572495, + "eval_rewards/rejected": -0.29490113258361816, + "eval_runtime": 443.5742, + "eval_samples_per_second": 26.523, + "eval_steps_per_second": 3.316, + "step": 7900 + }, + { + "epoch": 2.31, + "learning_rate": 7.687541493201347e-08, + "logits/chosen": -2.7405920028686523, + "logits/rejected": -2.7437314987182617, + "logps/chosen": -190.9894256591797, + "logps/rejected": -188.3881378173828, + "loss": 0.9488, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.2341761589050293, + "rewards/margins": 0.03301035612821579, + "rewards/rejected": -0.2671864926815033, + "step": 7910 + }, + { + "epoch": 2.31, + "learning_rate": 7.62642022568386e-08, + "logits/chosen": -2.7333028316497803, + "logits/rejected": -2.7367300987243652, + "logps/chosen": -195.2904815673828, + "logps/rejected": -180.70547485351562, + "loss": 0.8981, + "rewards/accuracies": 0.5718749761581421, + "rewards/chosen": -0.19127438962459564, + "rewards/margins": 0.07776583731174469, + "rewards/rejected": -0.26904022693634033, + "step": 7920 + }, + { + "epoch": 2.31, + "learning_rate": 7.565499145131193e-08, + "logits/chosen": -2.7502613067626953, + "logits/rejected": -2.708519458770752, + "logps/chosen": -222.30517578125, + "logps/rejected": -176.55001831054688, + "loss": 0.9368, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.2366447001695633, + "rewards/margins": 0.05661622807383537, + "rewards/rejected": -0.29326093196868896, + "step": 7930 + }, + { + "epoch": 2.32, + "learning_rate": 7.50477895350611e-08, + "logits/chosen": -2.740562677383423, + "logits/rejected": -2.7238717079162598, + "logps/chosen": -211.9696044921875, + "logps/rejected": -192.79771423339844, + "loss": 0.929, + "rewards/accuracies": 0.590624988079071, + "rewards/chosen": -0.21781253814697266, + "rewards/margins": 0.11220844089984894, + "rewards/rejected": -0.3300209641456604, + "step": 7940 + }, + { + "epoch": 2.32, + "learning_rate": 7.444260350456632e-08, + "logits/chosen": -2.7244935035705566, + "logits/rejected": -2.710855007171631, + "logps/chosen": -215.6509246826172, + "logps/rejected": -184.55307006835938, + "loss": 0.9274, + "rewards/accuracies": 0.5718749761581421, + "rewards/chosen": -0.230869323015213, + "rewards/margins": 0.07824772596359253, + "rewards/rejected": -0.3091171085834503, + "step": 7950 + }, + { + "epoch": 2.32, + "learning_rate": 7.383944033308003e-08, + "logits/chosen": -2.6875998973846436, + "logits/rejected": -2.71150279045105, + "logps/chosen": -199.21945190429688, + "logps/rejected": -196.64727783203125, + "loss": 0.9031, + "rewards/accuracies": 0.6468750238418579, + "rewards/chosen": -0.17514397203922272, + "rewards/margins": 0.11656300723552704, + "rewards/rejected": -0.29170694947242737, + "step": 7960 + }, + { + "epoch": 2.32, + "learning_rate": 7.323830697054592e-08, + "logits/chosen": -2.7250888347625732, + "logits/rejected": -2.7495064735412598, + "logps/chosen": -184.5178985595703, + "logps/rejected": -180.9095916748047, + "loss": 0.9147, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.21628186106681824, + "rewards/margins": 0.11150647699832916, + "rewards/rejected": -0.3277883529663086, + "step": 7970 + }, + { + "epoch": 2.33, + "learning_rate": 7.263921034351972e-08, + "logits/chosen": -2.726407527923584, + "logits/rejected": -2.7399487495422363, + "logps/chosen": -205.93057250976562, + "logps/rejected": -196.50315856933594, + "loss": 0.9267, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.2090451419353485, + "rewards/margins": 0.09366326034069061, + "rewards/rejected": -0.3027084171772003, + "step": 7980 + }, + { + "epoch": 2.33, + "learning_rate": 7.204215735508821e-08, + "logits/chosen": -2.7203097343444824, + "logits/rejected": -2.7385830879211426, + "logps/chosen": -188.94375610351562, + "logps/rejected": -196.45272827148438, + "loss": 0.9054, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.21457454562187195, + "rewards/margins": 0.13003358244895935, + "rewards/rejected": -0.3446081578731537, + "step": 7990 + }, + { + "epoch": 2.33, + "learning_rate": 7.1447154884791e-08, + "logits/chosen": -2.754638910293579, + "logits/rejected": -2.7675697803497314, + "logps/chosen": -206.0548553466797, + "logps/rejected": -201.75033569335938, + "loss": 0.9181, + "rewards/accuracies": 0.6781250238418579, + "rewards/chosen": -0.24122412502765656, + "rewards/margins": 0.09840685874223709, + "rewards/rejected": -0.33963099122047424, + "step": 8000 + }, + { + "epoch": 2.33, + "eval_logits/chosen": -2.6661343574523926, + "eval_logits/rejected": -2.6611063480377197, + "eval_logps/chosen": -197.75009155273438, + "eval_logps/rejected": -183.85853576660156, + "eval_loss": 0.9317770004272461, + "eval_rewards/accuracies": 0.5771583914756775, + "eval_rewards/chosen": -0.21447615325450897, + "eval_rewards/margins": 0.07924765348434448, + "eval_rewards/rejected": -0.29372379183769226, + "eval_runtime": 443.4039, + "eval_samples_per_second": 26.533, + "eval_steps_per_second": 3.318, + "step": 8000 + }, + { + "epoch": 2.34, + "learning_rate": 7.08542097885402e-08, + "logits/chosen": -2.739370346069336, + "logits/rejected": -2.739750862121582, + "logps/chosen": -168.8159942626953, + "logps/rejected": -170.51507568359375, + "loss": 0.93, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.2074386328458786, + "rewards/margins": 0.06587998569011688, + "rewards/rejected": -0.2733186185359955, + "step": 8010 + }, + { + "epoch": 2.34, + "learning_rate": 7.026332889854212e-08, + "logits/chosen": -2.7454347610473633, + "logits/rejected": -2.7481212615966797, + "logps/chosen": -191.80398559570312, + "logps/rejected": -169.96246337890625, + "loss": 0.9527, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.21735993027687073, + "rewards/margins": 0.05763064697384834, + "rewards/rejected": -0.2749905586242676, + "step": 8020 + }, + { + "epoch": 2.34, + "learning_rate": 6.967451902321808e-08, + "logits/chosen": -2.7633728981018066, + "logits/rejected": -2.762784481048584, + "logps/chosen": -207.1423797607422, + "logps/rejected": -194.453125, + "loss": 0.9206, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.2185828685760498, + "rewards/margins": 0.08463943004608154, + "rewards/rejected": -0.30322226881980896, + "step": 8030 + }, + { + "epoch": 2.35, + "learning_rate": 6.908778694712611e-08, + "logits/chosen": -2.7394168376922607, + "logits/rejected": -2.7601752281188965, + "logps/chosen": -197.01817321777344, + "logps/rejected": -191.726806640625, + "loss": 0.9391, + "rewards/accuracies": 0.5718749761581421, + "rewards/chosen": -0.2251404970884323, + "rewards/margins": 0.07005010545253754, + "rewards/rejected": -0.29519063234329224, + "step": 8040 + }, + { + "epoch": 2.35, + "learning_rate": 6.85031394308831e-08, + "logits/chosen": -2.749958038330078, + "logits/rejected": -2.7379648685455322, + "logps/chosen": -193.93630981445312, + "logps/rejected": -188.49380493164062, + "loss": 0.9394, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.24032609164714813, + "rewards/margins": 0.06645093858242035, + "rewards/rejected": -0.3067770302295685, + "step": 8050 + }, + { + "epoch": 2.35, + "learning_rate": 6.792058321108621e-08, + "logits/chosen": -2.73541522026062, + "logits/rejected": -2.743712902069092, + "logps/chosen": -181.5113983154297, + "logps/rejected": -181.34979248046875, + "loss": 0.9196, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.21504303812980652, + "rewards/margins": 0.08492358773946762, + "rewards/rejected": -0.29996663331985474, + "step": 8060 + }, + { + "epoch": 2.35, + "learning_rate": 6.734012500023608e-08, + "logits/chosen": -2.724332332611084, + "logits/rejected": -2.699763298034668, + "logps/chosen": -204.10018920898438, + "logps/rejected": -174.8278045654297, + "loss": 0.9323, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.23785284161567688, + "rewards/margins": 0.07769812643527985, + "rewards/rejected": -0.3155509829521179, + "step": 8070 + }, + { + "epoch": 2.36, + "learning_rate": 6.67617714866586e-08, + "logits/chosen": -2.7390198707580566, + "logits/rejected": -2.7389464378356934, + "logps/chosen": -196.2102508544922, + "logps/rejected": -187.58624267578125, + "loss": 0.9284, + "rewards/accuracies": 0.5406249761581421, + "rewards/chosen": -0.23619568347930908, + "rewards/margins": 0.07881676405668259, + "rewards/rejected": -0.31501245498657227, + "step": 8080 + }, + { + "epoch": 2.36, + "learning_rate": 6.618552933442869e-08, + "logits/chosen": -2.71484637260437, + "logits/rejected": -2.718121290206909, + "logps/chosen": -188.6407012939453, + "logps/rejected": -179.3391571044922, + "loss": 0.9224, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.20391671359539032, + "rewards/margins": 0.10935616493225098, + "rewards/rejected": -0.3132728636264801, + "step": 8090 + }, + { + "epoch": 2.36, + "learning_rate": 6.561140518329287e-08, + "logits/chosen": -2.669769525527954, + "logits/rejected": -2.695160150527954, + "logps/chosen": -166.22305297851562, + "logps/rejected": -174.75186157226562, + "loss": 0.9009, + "rewards/accuracies": 0.590624988079071, + "rewards/chosen": -0.18055148422718048, + "rewards/margins": 0.09363645315170288, + "rewards/rejected": -0.27418795228004456, + "step": 8100 + }, + { + "epoch": 2.36, + "eval_logits/chosen": -2.6632046699523926, + "eval_logits/rejected": -2.658102512359619, + "eval_logps/chosen": -197.7543182373047, + "eval_logps/rejected": -183.87362670898438, + "eval_loss": 0.9310758709907532, + "eval_rewards/accuracies": 0.5798776149749756, + "eval_rewards/chosen": -0.21490146219730377, + "eval_rewards/margins": 0.08033359050750732, + "eval_rewards/rejected": -0.2952350676059723, + "eval_runtime": 443.3937, + "eval_samples_per_second": 26.534, + "eval_steps_per_second": 3.318, + "step": 8100 + }, + { + "epoch": 2.37, + "learning_rate": 6.503940564859317e-08, + "logits/chosen": -2.7465438842773438, + "logits/rejected": -2.7310779094696045, + "logps/chosen": -203.1905059814453, + "logps/rejected": -194.52020263671875, + "loss": 0.9262, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.24113135039806366, + "rewards/margins": 0.06915220618247986, + "rewards/rejected": -0.3102835714817047, + "step": 8110 + }, + { + "epoch": 2.37, + "learning_rate": 6.446953732119068e-08, + "logits/chosen": -2.7239115238189697, + "logits/rejected": -2.7100701332092285, + "logps/chosen": -189.4662322998047, + "logps/rejected": -181.71133422851562, + "loss": 0.9064, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.22534950077533722, + "rewards/margins": 0.08933541178703308, + "rewards/rejected": -0.3146849274635315, + "step": 8120 + }, + { + "epoch": 2.37, + "learning_rate": 6.390180676738949e-08, + "logits/chosen": -2.742435932159424, + "logits/rejected": -2.728344678878784, + "logps/chosen": -213.70993041992188, + "logps/rejected": -189.58584594726562, + "loss": 0.9412, + "rewards/accuracies": 0.5406249761581421, + "rewards/chosen": -0.24831977486610413, + "rewards/margins": 0.04875670000910759, + "rewards/rejected": -0.2970764935016632, + "step": 8130 + }, + { + "epoch": 2.37, + "learning_rate": 6.333622052886159e-08, + "logits/chosen": -2.709750175476074, + "logits/rejected": -2.716714859008789, + "logps/chosen": -202.9120635986328, + "logps/rejected": -185.79696655273438, + "loss": 0.9295, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.2223753035068512, + "rewards/margins": 0.07038509845733643, + "rewards/rejected": -0.29276034235954285, + "step": 8140 + }, + { + "epoch": 2.38, + "learning_rate": 6.277278512257073e-08, + "logits/chosen": -2.7267911434173584, + "logits/rejected": -2.747617483139038, + "logps/chosen": -220.88833618164062, + "logps/rejected": -208.8884735107422, + "loss": 0.9105, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.22989530861377716, + "rewards/margins": 0.12462420761585236, + "rewards/rejected": -0.3545195162296295, + "step": 8150 + }, + { + "epoch": 2.38, + "learning_rate": 6.221150704069808e-08, + "logits/chosen": -2.728583574295044, + "logits/rejected": -2.742826461791992, + "logps/chosen": -177.401123046875, + "logps/rejected": -175.57814025878906, + "loss": 0.91, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.22042067348957062, + "rewards/margins": 0.07267331331968307, + "rewards/rejected": -0.2930939793586731, + "step": 8160 + }, + { + "epoch": 2.38, + "learning_rate": 6.165239275056658e-08, + "logits/chosen": -2.7489070892333984, + "logits/rejected": -2.728541135787964, + "logps/chosen": -202.7565460205078, + "logps/rejected": -191.60960388183594, + "loss": 0.9296, + "rewards/accuracies": 0.559374988079071, + "rewards/chosen": -0.21797093749046326, + "rewards/margins": 0.07035072892904282, + "rewards/rejected": -0.28832167387008667, + "step": 8170 + }, + { + "epoch": 2.39, + "learning_rate": 6.109544869456734e-08, + "logits/chosen": -2.7465271949768066, + "logits/rejected": -2.7321600914001465, + "logps/chosen": -217.69332885742188, + "logps/rejected": -183.03245544433594, + "loss": 0.9104, + "rewards/accuracies": 0.6156250238418579, + "rewards/chosen": -0.20488587021827698, + "rewards/margins": 0.1237928494811058, + "rewards/rejected": -0.32867875695228577, + "step": 8180 + }, + { + "epoch": 2.39, + "learning_rate": 6.054068129008464e-08, + "logits/chosen": -2.7363193035125732, + "logits/rejected": -2.724510669708252, + "logps/chosen": -197.43472290039062, + "logps/rejected": -179.55078125, + "loss": 0.9333, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.22554786503314972, + "rewards/margins": 0.07664482295513153, + "rewards/rejected": -0.30219265818595886, + "step": 8190 + }, + { + "epoch": 2.39, + "learning_rate": 5.998809692942261e-08, + "logits/chosen": -2.7568459510803223, + "logits/rejected": -2.726026773452759, + "logps/chosen": -224.9655303955078, + "logps/rejected": -181.2517852783203, + "loss": 0.9091, + "rewards/accuracies": 0.559374988079071, + "rewards/chosen": -0.20677609741687775, + "rewards/margins": 0.0898161232471466, + "rewards/rejected": -0.29659220576286316, + "step": 8200 + }, + { + "epoch": 2.39, + "eval_logits/chosen": -2.643325090408325, + "eval_logits/rejected": -2.6378369331359863, + "eval_logps/chosen": -197.7702178955078, + "eval_logps/rejected": -183.88160705566406, + "eval_loss": 0.9310687184333801, + "eval_rewards/accuracies": 0.5829367637634277, + "eval_rewards/chosen": -0.21649178862571716, + "eval_rewards/margins": 0.0795392096042633, + "eval_rewards/rejected": -0.29603099822998047, + "eval_runtime": 443.4095, + "eval_samples_per_second": 26.533, + "eval_steps_per_second": 3.317, + "step": 8200 + }, + { + "epoch": 2.39, + "learning_rate": 5.9437701979730995e-08, + "logits/chosen": -2.7097978591918945, + "logits/rejected": -2.7331063747406006, + "logps/chosen": -183.22218322753906, + "logps/rejected": -181.6038360595703, + "loss": 0.9293, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.2365005761384964, + "rewards/margins": 0.07266677170991898, + "rewards/rejected": -0.3091673254966736, + "step": 8210 + }, + { + "epoch": 2.4, + "learning_rate": 5.8889502782932174e-08, + "logits/chosen": -2.7132773399353027, + "logits/rejected": -2.741889238357544, + "logps/chosen": -187.0943145751953, + "logps/rejected": -183.83511352539062, + "loss": 0.9098, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.2061687409877777, + "rewards/margins": 0.11216280609369278, + "rewards/rejected": -0.3183315396308899, + "step": 8220 + }, + { + "epoch": 2.4, + "learning_rate": 5.834350565564805e-08, + "logits/chosen": -2.7249484062194824, + "logits/rejected": -2.7134346961975098, + "logps/chosen": -182.797119140625, + "logps/rejected": -170.59536743164062, + "loss": 0.9177, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.19757375121116638, + "rewards/margins": 0.07276350259780884, + "rewards/rejected": -0.27033722400665283, + "step": 8230 + }, + { + "epoch": 2.4, + "learning_rate": 5.779971688912702e-08, + "logits/chosen": -2.735774517059326, + "logits/rejected": -2.743027448654175, + "logps/chosen": -181.68629455566406, + "logps/rejected": -188.88807678222656, + "loss": 0.9373, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.2310686856508255, + "rewards/margins": 0.07609544694423676, + "rewards/rejected": -0.30716413259506226, + "step": 8240 + }, + { + "epoch": 2.41, + "learning_rate": 5.725814274917198e-08, + "logits/chosen": -2.7728989124298096, + "logits/rejected": -2.724799394607544, + "logps/chosen": -223.89620971679688, + "logps/rejected": -192.3927001953125, + "loss": 0.9281, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.2514110803604126, + "rewards/margins": 0.06689505279064178, + "rewards/rejected": -0.3183061480522156, + "step": 8250 + }, + { + "epoch": 2.41, + "learning_rate": 5.6718789476067287e-08, + "logits/chosen": -2.734097957611084, + "logits/rejected": -2.746814489364624, + "logps/chosen": -203.88644409179688, + "logps/rejected": -201.47889709472656, + "loss": 0.9254, + "rewards/accuracies": 0.6031249761581421, + "rewards/chosen": -0.21979546546936035, + "rewards/margins": 0.08329310268163681, + "rewards/rejected": -0.30308857560157776, + "step": 8260 + }, + { + "epoch": 2.41, + "learning_rate": 5.618166328450788e-08, + "logits/chosen": -2.7586183547973633, + "logits/rejected": -2.7315571308135986, + "logps/chosen": -208.2580108642578, + "logps/rejected": -175.76177978515625, + "loss": 0.9377, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.2211875170469284, + "rewards/margins": 0.069062739610672, + "rewards/rejected": -0.2902502417564392, + "step": 8270 + }, + { + "epoch": 2.42, + "learning_rate": 5.564677036352683e-08, + "logits/chosen": -2.7465243339538574, + "logits/rejected": -2.7541754245758057, + "logps/chosen": -196.60305786132812, + "logps/rejected": -199.50289916992188, + "loss": 0.9082, + "rewards/accuracies": 0.628125011920929, + "rewards/chosen": -0.2293652594089508, + "rewards/margins": 0.10975439846515656, + "rewards/rejected": -0.33911964297294617, + "step": 8280 + }, + { + "epoch": 2.42, + "learning_rate": 5.5114116876424643e-08, + "logits/chosen": -2.75584077835083, + "logits/rejected": -2.746237277984619, + "logps/chosen": -217.4754180908203, + "logps/rejected": -197.7372283935547, + "loss": 0.9273, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.21786010265350342, + "rewards/margins": 0.09159267693758011, + "rewards/rejected": -0.3094528019428253, + "step": 8290 + }, + { + "epoch": 2.42, + "learning_rate": 5.4583708960697695e-08, + "logits/chosen": -2.737339735031128, + "logits/rejected": -2.761617660522461, + "logps/chosen": -214.1763916015625, + "logps/rejected": -195.55984497070312, + "loss": 0.9091, + "rewards/accuracies": 0.5718749761581421, + "rewards/chosen": -0.2552254796028137, + "rewards/margins": 0.09345632046461105, + "rewards/rejected": -0.348681777715683, + "step": 8300 + }, + { + "epoch": 2.42, + "eval_logits/chosen": -2.652771234512329, + "eval_logits/rejected": -2.6474828720092773, + "eval_logps/chosen": -197.7509765625, + "eval_logps/rejected": -183.8717041015625, + "eval_loss": 0.9311734437942505, + "eval_rewards/accuracies": 0.5832766890525818, + "eval_rewards/chosen": -0.21456702053546906, + "eval_rewards/margins": 0.08047395944595337, + "eval_rewards/rejected": -0.29504096508026123, + "eval_runtime": 443.4337, + "eval_samples_per_second": 26.532, + "eval_steps_per_second": 3.317, + "step": 8300 + }, + { + "epoch": 2.42, + "learning_rate": 5.405555272796788e-08, + "logits/chosen": -2.7279231548309326, + "logits/rejected": -2.6993558406829834, + "logps/chosen": -218.52407836914062, + "logps/rejected": -187.58221435546875, + "loss": 0.9191, + "rewards/accuracies": 0.596875011920929, + "rewards/chosen": -0.2137344628572464, + "rewards/margins": 0.08393244445323944, + "rewards/rejected": -0.2976669371128082, + "step": 8310 + }, + { + "epoch": 2.43, + "learning_rate": 5.3529654263912205e-08, + "logits/chosen": -2.746372938156128, + "logits/rejected": -2.7346367835998535, + "logps/chosen": -228.1139678955078, + "logps/rejected": -198.20999145507812, + "loss": 0.9182, + "rewards/accuracies": 0.5718749761581421, + "rewards/chosen": -0.2145860195159912, + "rewards/margins": 0.09652328491210938, + "rewards/rejected": -0.3111093044281006, + "step": 8320 + }, + { + "epoch": 2.43, + "learning_rate": 5.300601962819229e-08, + "logits/chosen": -2.6986913681030273, + "logits/rejected": -2.711686372756958, + "logps/chosen": -177.49481201171875, + "logps/rejected": -178.4385986328125, + "loss": 0.9027, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.22211650013923645, + "rewards/margins": 0.12292201817035675, + "rewards/rejected": -0.3450385332107544, + "step": 8330 + }, + { + "epoch": 2.43, + "learning_rate": 5.248465485438522e-08, + "logits/chosen": -2.7135062217712402, + "logits/rejected": -2.7118725776672363, + "logps/chosen": -171.53228759765625, + "logps/rejected": -163.58480834960938, + "loss": 0.9388, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.23733428120613098, + "rewards/margins": 0.06133853644132614, + "rewards/rejected": -0.2986728250980377, + "step": 8340 + }, + { + "epoch": 2.44, + "learning_rate": 5.196556594991308e-08, + "logits/chosen": -2.708112955093384, + "logits/rejected": -2.728519916534424, + "logps/chosen": -201.09585571289062, + "logps/rejected": -194.08924865722656, + "loss": 0.9024, + "rewards/accuracies": 0.6156250238418579, + "rewards/chosen": -0.23435361683368683, + "rewards/margins": 0.12458226829767227, + "rewards/rejected": -0.3589359223842621, + "step": 8350 + }, + { + "epoch": 2.44, + "learning_rate": 5.144875889597469e-08, + "logits/chosen": -2.7321176528930664, + "logits/rejected": -2.7355644702911377, + "logps/chosen": -176.49795532226562, + "logps/rejected": -187.25527954101562, + "loss": 0.9326, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.21027982234954834, + "rewards/margins": 0.05963939428329468, + "rewards/rejected": -0.2699192464351654, + "step": 8360 + }, + { + "epoch": 2.44, + "learning_rate": 5.093423964747603e-08, + "logits/chosen": -2.7487478256225586, + "logits/rejected": -2.724884033203125, + "logps/chosen": -194.27857971191406, + "logps/rejected": -172.05706787109375, + "loss": 0.9393, + "rewards/accuracies": 0.5531250238418579, + "rewards/chosen": -0.2161370813846588, + "rewards/margins": 0.07543951272964478, + "rewards/rejected": -0.2915765643119812, + "step": 8370 + }, + { + "epoch": 2.44, + "learning_rate": 5.042201413296207e-08, + "logits/chosen": -2.7153565883636475, + "logits/rejected": -2.732156276702881, + "logps/chosen": -196.8876953125, + "logps/rejected": -196.85394287109375, + "loss": 0.9173, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.17807409167289734, + "rewards/margins": 0.09584574401378632, + "rewards/rejected": -0.27391982078552246, + "step": 8380 + }, + { + "epoch": 2.45, + "learning_rate": 4.991208825454804e-08, + "logits/chosen": -2.717339038848877, + "logits/rejected": -2.7379343509674072, + "logps/chosen": -186.6918182373047, + "logps/rejected": -189.26284790039062, + "loss": 0.9068, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.2063685953617096, + "rewards/margins": 0.09851086884737015, + "rewards/rejected": -0.30487945675849915, + "step": 8390 + }, + { + "epoch": 2.45, + "learning_rate": 4.94044678878516e-08, + "logits/chosen": -2.741137742996216, + "logits/rejected": -2.757124423980713, + "logps/chosen": -203.63336181640625, + "logps/rejected": -199.83615112304688, + "loss": 0.9419, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.22201666235923767, + "rewards/margins": 0.07703028619289398, + "rewards/rejected": -0.29904693365097046, + "step": 8400 + }, + { + "epoch": 2.45, + "eval_logits/chosen": -2.6419103145599365, + "eval_logits/rejected": -2.636409044265747, + "eval_logps/chosen": -197.74330139160156, + "eval_logps/rejected": -183.8677520751953, + "eval_loss": 0.930666446685791, + "eval_rewards/accuracies": 0.5776682496070862, + "eval_rewards/chosen": -0.21379975974559784, + "eval_rewards/margins": 0.08084730058908463, + "eval_rewards/rejected": -0.29464706778526306, + "eval_runtime": 443.3983, + "eval_samples_per_second": 26.534, + "eval_steps_per_second": 3.318, + "step": 8400 + }, + { + "epoch": 2.45, + "learning_rate": 4.889915888192545e-08, + "logits/chosen": -2.7031803131103516, + "logits/rejected": -2.6964354515075684, + "logps/chosen": -194.42892456054688, + "logps/rejected": -179.48733520507812, + "loss": 0.9192, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.21093186736106873, + "rewards/margins": 0.10730777680873871, + "rewards/rejected": -0.31823965907096863, + "step": 8410 + }, + { + "epoch": 2.46, + "learning_rate": 4.839616705918928e-08, + "logits/chosen": -2.7463109493255615, + "logits/rejected": -2.7112486362457275, + "logps/chosen": -203.19883728027344, + "logps/rejected": -176.5725860595703, + "loss": 0.9298, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.2048642933368683, + "rewards/margins": 0.09186164289712906, + "rewards/rejected": -0.29672592878341675, + "step": 8420 + }, + { + "epoch": 2.46, + "learning_rate": 4.7895498215363487e-08, + "logits/chosen": -2.7438952922821045, + "logits/rejected": -2.730785369873047, + "logps/chosen": -208.5506134033203, + "logps/rejected": -184.9493865966797, + "loss": 0.9244, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.18712742626667023, + "rewards/margins": 0.09391121566295624, + "rewards/rejected": -0.28103864192962646, + "step": 8430 + }, + { + "epoch": 2.46, + "learning_rate": 4.73971581194014e-08, + "logits/chosen": -2.7060279846191406, + "logits/rejected": -2.718268871307373, + "logps/chosen": -183.0648193359375, + "logps/rejected": -178.09503173828125, + "loss": 0.9307, + "rewards/accuracies": 0.559374988079071, + "rewards/chosen": -0.21532253921031952, + "rewards/margins": 0.06491868197917938, + "rewards/rejected": -0.2802412211894989, + "step": 8440 + }, + { + "epoch": 2.46, + "learning_rate": 4.6901152513423886e-08, + "logits/chosen": -2.74855637550354, + "logits/rejected": -2.7635855674743652, + "logps/chosen": -182.1060791015625, + "logps/rejected": -195.69158935546875, + "loss": 0.8908, + "rewards/accuracies": 0.596875011920929, + "rewards/chosen": -0.18026016652584076, + "rewards/margins": 0.11594252288341522, + "rewards/rejected": -0.29620271921157837, + "step": 8450 + }, + { + "epoch": 2.47, + "learning_rate": 4.640748711265236e-08, + "logits/chosen": -2.7476954460144043, + "logits/rejected": -2.7142105102539062, + "logps/chosen": -209.7215576171875, + "logps/rejected": -177.82785034179688, + "loss": 0.9138, + "rewards/accuracies": 0.5531250238418579, + "rewards/chosen": -0.24884888529777527, + "rewards/margins": 0.07080671936273575, + "rewards/rejected": -0.3196555972099304, + "step": 8460 + }, + { + "epoch": 2.47, + "learning_rate": 4.5916167605343394e-08, + "logits/chosen": -2.7221813201904297, + "logits/rejected": -2.6793854236602783, + "logps/chosen": -197.37985229492188, + "logps/rejected": -169.0908966064453, + "loss": 0.9334, + "rewards/accuracies": 0.528124988079071, + "rewards/chosen": -0.22837810218334198, + "rewards/margins": 0.06554639339447021, + "rewards/rejected": -0.2939245104789734, + "step": 8470 + }, + { + "epoch": 2.47, + "learning_rate": 4.542719965272293e-08, + "logits/chosen": -2.727269172668457, + "logits/rejected": -2.7378478050231934, + "logps/chosen": -209.7412872314453, + "logps/rejected": -187.45875549316406, + "loss": 0.9353, + "rewards/accuracies": 0.628125011920929, + "rewards/chosen": -0.23735317587852478, + "rewards/margins": 0.11680205911397934, + "rewards/rejected": -0.35415521264076233, + "step": 8480 + }, + { + "epoch": 2.48, + "learning_rate": 4.4940588888921075e-08, + "logits/chosen": -2.7760841846466064, + "logits/rejected": -2.7486133575439453, + "logps/chosen": -206.3143310546875, + "logps/rejected": -184.89443969726562, + "loss": 0.9335, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.22854101657867432, + "rewards/margins": 0.070357546210289, + "rewards/rejected": -0.2988985776901245, + "step": 8490 + }, + { + "epoch": 2.48, + "learning_rate": 4.445634092090747e-08, + "logits/chosen": -2.712855577468872, + "logits/rejected": -2.6964898109436035, + "logps/chosen": -201.7066192626953, + "logps/rejected": -184.13980102539062, + "loss": 0.9203, + "rewards/accuracies": 0.590624988079071, + "rewards/chosen": -0.18783876299858093, + "rewards/margins": 0.0779946893453598, + "rewards/rejected": -0.26583343744277954, + "step": 8500 + }, + { + "epoch": 2.48, + "eval_logits/chosen": -2.652738332748413, + "eval_logits/rejected": -2.6474361419677734, + "eval_logps/chosen": -197.7529296875, + "eval_logps/rejected": -183.8688201904297, + "eval_loss": 0.9313034415245056, + "eval_rewards/accuracies": 0.5834466218948364, + "eval_rewards/chosen": -0.21476346254348755, + "eval_rewards/margins": 0.07999155670404434, + "eval_rewards/rejected": -0.2947549819946289, + "eval_runtime": 443.3861, + "eval_samples_per_second": 26.534, + "eval_steps_per_second": 3.318, + "step": 8500 + }, + { + "epoch": 2.48, + "learning_rate": 4.397446132842619e-08, + "logits/chosen": -2.721277952194214, + "logits/rejected": -2.7163240909576416, + "logps/chosen": -190.77566528320312, + "logps/rejected": -181.6962890625, + "loss": 0.9341, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.2155207097530365, + "rewards/margins": 0.08565304428339005, + "rewards/rejected": -0.30117374658584595, + "step": 8510 + }, + { + "epoch": 2.49, + "learning_rate": 4.349495566393202e-08, + "logits/chosen": -2.753206253051758, + "logits/rejected": -2.7568376064300537, + "logps/chosen": -175.25245666503906, + "logps/rejected": -165.4966583251953, + "loss": 0.9363, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.22295379638671875, + "rewards/margins": 0.06077491119503975, + "rewards/rejected": -0.2837287187576294, + "step": 8520 + }, + { + "epoch": 2.49, + "learning_rate": 4.3017829452525714e-08, + "logits/chosen": -2.7324330806732178, + "logits/rejected": -2.7552661895751953, + "logps/chosen": -197.71481323242188, + "logps/rejected": -198.50564575195312, + "loss": 0.9214, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.21423344314098358, + "rewards/margins": 0.12743337452411652, + "rewards/rejected": -0.3416668176651001, + "step": 8530 + }, + { + "epoch": 2.49, + "learning_rate": 4.254308819189131e-08, + "logits/chosen": -2.7188706398010254, + "logits/rejected": -2.7103164196014404, + "logps/chosen": -187.2228546142578, + "logps/rejected": -171.23370361328125, + "loss": 0.9343, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.22129873931407928, + "rewards/margins": 0.0748787671327591, + "rewards/rejected": -0.29617753624916077, + "step": 8540 + }, + { + "epoch": 2.49, + "learning_rate": 4.207073735223188e-08, + "logits/chosen": -2.74505615234375, + "logits/rejected": -2.7541544437408447, + "logps/chosen": -200.2747039794922, + "logps/rejected": -189.50448608398438, + "loss": 0.9122, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.19934865832328796, + "rewards/margins": 0.10086574405431747, + "rewards/rejected": -0.30021440982818604, + "step": 8550 + }, + { + "epoch": 2.5, + "learning_rate": 4.160078237620715e-08, + "logits/chosen": -2.744319438934326, + "logits/rejected": -2.755634307861328, + "logps/chosen": -188.6450653076172, + "logps/rejected": -186.31484985351562, + "loss": 0.9167, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.2137790024280548, + "rewards/margins": 0.09879012405872345, + "rewards/rejected": -0.31256914138793945, + "step": 8560 + }, + { + "epoch": 2.5, + "learning_rate": 4.113322867887034e-08, + "logits/chosen": -2.745191812515259, + "logits/rejected": -2.736252546310425, + "logps/chosen": -192.93218994140625, + "logps/rejected": -176.01040649414062, + "loss": 0.9286, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.23361460864543915, + "rewards/margins": 0.0659651830792427, + "rewards/rejected": -0.29957979917526245, + "step": 8570 + }, + { + "epoch": 2.5, + "learning_rate": 4.066808164760599e-08, + "logits/chosen": -2.7102911472320557, + "logits/rejected": -2.7131974697113037, + "logps/chosen": -200.6427001953125, + "logps/rejected": -179.17141723632812, + "loss": 0.9359, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.20251154899597168, + "rewards/margins": 0.06852830946445465, + "rewards/rejected": -0.2710398733615875, + "step": 8580 + }, + { + "epoch": 2.51, + "learning_rate": 4.020534664206798e-08, + "logits/chosen": -2.7358498573303223, + "logits/rejected": -2.7214462757110596, + "logps/chosen": -195.99107360839844, + "logps/rejected": -192.43423461914062, + "loss": 0.9251, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.22206735610961914, + "rewards/margins": 0.10203574597835541, + "rewards/rejected": -0.32410308718681335, + "step": 8590 + }, + { + "epoch": 2.51, + "learning_rate": 3.974502899411741e-08, + "logits/chosen": -2.744194507598877, + "logits/rejected": -2.735948324203491, + "logps/chosen": -219.98733520507812, + "logps/rejected": -201.3661651611328, + "loss": 0.9102, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2364121973514557, + "rewards/margins": 0.10045131295919418, + "rewards/rejected": -0.3368634879589081, + "step": 8600 + }, + { + "epoch": 2.51, + "eval_logits/chosen": -2.648935079574585, + "eval_logits/rejected": -2.6435654163360596, + "eval_logps/chosen": -197.7634735107422, + "eval_logps/rejected": -183.8791046142578, + "eval_loss": 0.9314857125282288, + "eval_rewards/accuracies": 0.5820870399475098, + "eval_rewards/chosen": -0.21581508219242096, + "eval_rewards/margins": 0.0799664780497551, + "eval_rewards/rejected": -0.29578158259391785, + "eval_runtime": 443.3659, + "eval_samples_per_second": 26.536, + "eval_steps_per_second": 3.318, + "step": 8600 + }, + { + "epoch": 2.51, + "learning_rate": 3.9287134007761677e-08, + "logits/chosen": -2.763599157333374, + "logits/rejected": -2.750394821166992, + "logps/chosen": -215.42379760742188, + "logps/rejected": -189.61410522460938, + "loss": 0.9236, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.20579537749290466, + "rewards/margins": 0.09698157757520676, + "rewards/rejected": -0.3027769923210144, + "step": 8610 + }, + { + "epoch": 2.51, + "learning_rate": 3.883166695909268e-08, + "logits/chosen": -2.75989031791687, + "logits/rejected": -2.7441859245300293, + "logps/chosen": -209.43435668945312, + "logps/rejected": -182.25961303710938, + "loss": 0.9205, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.2286672294139862, + "rewards/margins": 0.07702066004276276, + "rewards/rejected": -0.30568793416023254, + "step": 8620 + }, + { + "epoch": 2.52, + "learning_rate": 3.837863309622683e-08, + "logits/chosen": -2.7383246421813965, + "logits/rejected": -2.728440046310425, + "logps/chosen": -210.39956665039062, + "logps/rejected": -188.38473510742188, + "loss": 0.9276, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.21961596608161926, + "rewards/margins": 0.07722331583499908, + "rewards/rejected": -0.29683929681777954, + "step": 8630 + }, + { + "epoch": 2.52, + "learning_rate": 3.79280376392439e-08, + "logits/chosen": -2.7541236877441406, + "logits/rejected": -2.752408504486084, + "logps/chosen": -225.40658569335938, + "logps/rejected": -200.19308471679688, + "loss": 0.9193, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.19588294625282288, + "rewards/margins": 0.09243004769086838, + "rewards/rejected": -0.28831297159194946, + "step": 8640 + }, + { + "epoch": 2.52, + "learning_rate": 3.747988578012731e-08, + "logits/chosen": -2.731980323791504, + "logits/rejected": -2.744915723800659, + "logps/chosen": -200.18807983398438, + "logps/rejected": -192.57229614257812, + "loss": 0.9226, + "rewards/accuracies": 0.596875011920929, + "rewards/chosen": -0.20432892441749573, + "rewards/margins": 0.10759834200143814, + "rewards/rejected": -0.31192725896835327, + "step": 8650 + }, + { + "epoch": 2.53, + "learning_rate": 3.703418268270406e-08, + "logits/chosen": -2.7322587966918945, + "logits/rejected": -2.740971326828003, + "logps/chosen": -217.7086181640625, + "logps/rejected": -195.62628173828125, + "loss": 0.9276, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.23085355758666992, + "rewards/margins": 0.11779451370239258, + "rewards/rejected": -0.3486481010913849, + "step": 8660 + }, + { + "epoch": 2.53, + "learning_rate": 3.659093348258521e-08, + "logits/chosen": -2.749173879623413, + "logits/rejected": -2.7466702461242676, + "logps/chosen": -196.7553253173828, + "logps/rejected": -182.85800170898438, + "loss": 0.9168, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.2209496796131134, + "rewards/margins": 0.07574382424354553, + "rewards/rejected": -0.29669347405433655, + "step": 8670 + }, + { + "epoch": 2.53, + "learning_rate": 3.615014328710706e-08, + "logits/chosen": -2.7865371704101562, + "logits/rejected": -2.7517011165618896, + "logps/chosen": -191.7059783935547, + "logps/rejected": -178.4337615966797, + "loss": 0.9019, + "rewards/accuracies": 0.590624988079071, + "rewards/chosen": -0.2355107069015503, + "rewards/margins": 0.08629553020000458, + "rewards/rejected": -0.32180625200271606, + "step": 8680 + }, + { + "epoch": 2.53, + "learning_rate": 3.5711817175271744e-08, + "logits/chosen": -2.7685344219207764, + "logits/rejected": -2.7406980991363525, + "logps/chosen": -220.50802612304688, + "logps/rejected": -182.41189575195312, + "loss": 0.9186, + "rewards/accuracies": 0.5406249761581421, + "rewards/chosen": -0.2188243418931961, + "rewards/margins": 0.0704570859670639, + "rewards/rejected": -0.28928142786026, + "step": 8690 + }, + { + "epoch": 2.54, + "learning_rate": 3.5275960197689355e-08, + "logits/chosen": -2.738590717315674, + "logits/rejected": -2.7316675186157227, + "logps/chosen": -196.73020935058594, + "logps/rejected": -180.64486694335938, + "loss": 0.9327, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.24094291031360626, + "rewards/margins": 0.05457784980535507, + "rewards/rejected": -0.2955207824707031, + "step": 8700 + }, + { + "epoch": 2.54, + "eval_logits/chosen": -2.655764102935791, + "eval_logits/rejected": -2.6505346298217773, + "eval_logps/chosen": -197.75112915039062, + "eval_logps/rejected": -183.8668670654297, + "eval_loss": 0.9315599203109741, + "eval_rewards/accuracies": 0.582426905632019, + "eval_rewards/chosen": -0.21458037197589874, + "eval_rewards/margins": 0.07997720688581467, + "eval_rewards/rejected": -0.2945576012134552, + "eval_runtime": 443.3832, + "eval_samples_per_second": 26.535, + "eval_steps_per_second": 3.318, + "step": 8700 + }, + { + "epoch": 2.54, + "learning_rate": 3.484257737651897e-08, + "logits/chosen": -2.7013044357299805, + "logits/rejected": -2.7178680896759033, + "logps/chosen": -172.18899536132812, + "logps/rejected": -177.84359741210938, + "loss": 0.9021, + "rewards/accuracies": 0.621874988079071, + "rewards/chosen": -0.21273639798164368, + "rewards/margins": 0.10589251667261124, + "rewards/rejected": -0.31862884759902954, + "step": 8710 + }, + { + "epoch": 2.54, + "learning_rate": 3.441167370541162e-08, + "logits/chosen": -2.7266368865966797, + "logits/rejected": -2.745483875274658, + "logps/chosen": -196.69894409179688, + "logps/rejected": -201.3649444580078, + "loss": 0.913, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.19837717711925507, + "rewards/margins": 0.117425337433815, + "rewards/rejected": -0.31580251455307007, + "step": 8720 + }, + { + "epoch": 2.55, + "learning_rate": 3.398325414945208e-08, + "logits/chosen": -2.7272849082946777, + "logits/rejected": -2.738806962966919, + "logps/chosen": -187.09805297851562, + "logps/rejected": -176.56629943847656, + "loss": 0.9094, + "rewards/accuracies": 0.596875011920929, + "rewards/chosen": -0.24703994393348694, + "rewards/margins": 0.08300556987524033, + "rewards/rejected": -0.33004552125930786, + "step": 8730 + }, + { + "epoch": 2.55, + "learning_rate": 3.355732364510208e-08, + "logits/chosen": -2.716055393218994, + "logits/rejected": -2.7345597743988037, + "logps/chosen": -167.56236267089844, + "logps/rejected": -176.9795684814453, + "loss": 0.9222, + "rewards/accuracies": 0.559374988079071, + "rewards/chosen": -0.21685859560966492, + "rewards/margins": 0.08585675805807114, + "rewards/rejected": -0.30271536111831665, + "step": 8740 + }, + { + "epoch": 2.55, + "learning_rate": 3.31338871001432e-08, + "logits/chosen": -2.7347230911254883, + "logits/rejected": -2.741093635559082, + "logps/chosen": -210.4318389892578, + "logps/rejected": -184.84768676757812, + "loss": 0.9008, + "rewards/accuracies": 0.6031249761581421, + "rewards/chosen": -0.20497319102287292, + "rewards/margins": 0.12005607783794403, + "rewards/rejected": -0.32502925395965576, + "step": 8750 + }, + { + "epoch": 2.56, + "learning_rate": 3.2712949393620323e-08, + "logits/chosen": -2.756697177886963, + "logits/rejected": -2.735779285430908, + "logps/chosen": -201.85545349121094, + "logps/rejected": -180.489013671875, + "loss": 0.9274, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.21515175700187683, + "rewards/margins": 0.07304902374744415, + "rewards/rejected": -0.2882007956504822, + "step": 8760 + }, + { + "epoch": 2.56, + "learning_rate": 3.2294515375785644e-08, + "logits/chosen": -2.707594394683838, + "logits/rejected": -2.7309513092041016, + "logps/chosen": -163.45175170898438, + "logps/rejected": -170.62564086914062, + "loss": 0.929, + "rewards/accuracies": 0.559374988079071, + "rewards/chosen": -0.21401043236255646, + "rewards/margins": 0.06886029243469238, + "rewards/rejected": -0.28287073969841003, + "step": 8770 + }, + { + "epoch": 2.56, + "learning_rate": 3.187858986804243e-08, + "logits/chosen": -2.7427127361297607, + "logits/rejected": -2.7254652976989746, + "logps/chosen": -209.6241912841797, + "logps/rejected": -183.1163330078125, + "loss": 0.9239, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.21841943264007568, + "rewards/margins": 0.08116400986909866, + "rewards/rejected": -0.29958346486091614, + "step": 8780 + }, + { + "epoch": 2.56, + "learning_rate": 3.146517766288992e-08, + "logits/chosen": -2.7649950981140137, + "logits/rejected": -2.734921455383301, + "logps/chosen": -218.80978393554688, + "logps/rejected": -192.93006896972656, + "loss": 0.9264, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.23358440399169922, + "rewards/margins": 0.0533999502658844, + "rewards/rejected": -0.286984384059906, + "step": 8790 + }, + { + "epoch": 2.57, + "learning_rate": 3.105428352386747e-08, + "logits/chosen": -2.7432548999786377, + "logits/rejected": -2.7458105087280273, + "logps/chosen": -202.32501220703125, + "logps/rejected": -191.49530029296875, + "loss": 0.9221, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.2086220532655716, + "rewards/margins": 0.14341671764850616, + "rewards/rejected": -0.35203877091407776, + "step": 8800 + }, + { + "epoch": 2.57, + "eval_logits/chosen": -2.670869827270508, + "eval_logits/rejected": -2.665914297103882, + "eval_logps/chosen": -197.7539520263672, + "eval_logps/rejected": -183.87417602539062, + "eval_loss": 0.9304640889167786, + "eval_rewards/accuracies": 0.5827668309211731, + "eval_rewards/chosen": -0.21486559510231018, + "eval_rewards/margins": 0.08042251318693161, + "eval_rewards/rejected": -0.2952880859375, + "eval_runtime": 443.4695, + "eval_samples_per_second": 26.529, + "eval_steps_per_second": 3.317, + "step": 8800 + }, + { + "epoch": 2.57, + "learning_rate": 3.064591218550036e-08, + "logits/chosen": -2.744394540786743, + "logits/rejected": -2.749730348587036, + "logps/chosen": -180.59036254882812, + "logps/rejected": -174.34088134765625, + "loss": 0.9156, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.233584925532341, + "rewards/margins": 0.08018484711647034, + "rewards/rejected": -0.3137698173522949, + "step": 8810 + }, + { + "epoch": 2.57, + "learning_rate": 3.024006835324475e-08, + "logits/chosen": -2.6970248222351074, + "logits/rejected": -2.721404790878296, + "logps/chosen": -180.15753173828125, + "logps/rejected": -171.57437133789062, + "loss": 0.9174, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.20657749474048615, + "rewards/margins": 0.09772597253322601, + "rewards/rejected": -0.30430346727371216, + "step": 8820 + }, + { + "epoch": 2.58, + "learning_rate": 2.983675670343372e-08, + "logits/chosen": -2.7651171684265137, + "logits/rejected": -2.737391948699951, + "logps/chosen": -200.2530975341797, + "logps/rejected": -179.17733764648438, + "loss": 0.9196, + "rewards/accuracies": 0.546875, + "rewards/chosen": -0.21371407806873322, + "rewards/margins": 0.07342345267534256, + "rewards/rejected": -0.28713753819465637, + "step": 8830 + }, + { + "epoch": 2.58, + "learning_rate": 2.9435981883223244e-08, + "logits/chosen": -2.759326219558716, + "logits/rejected": -2.7474710941314697, + "logps/chosen": -200.50869750976562, + "logps/rejected": -190.08184814453125, + "loss": 0.9159, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.2350468635559082, + "rewards/margins": 0.08074475079774857, + "rewards/rejected": -0.31579163670539856, + "step": 8840 + }, + { + "epoch": 2.58, + "learning_rate": 2.9037748510538585e-08, + "logits/chosen": -2.74891996383667, + "logits/rejected": -2.754908800125122, + "logps/chosen": -193.0035400390625, + "logps/rejected": -192.65682983398438, + "loss": 0.9152, + "rewards/accuracies": 0.596875011920929, + "rewards/chosen": -0.20041045546531677, + "rewards/margins": 0.10050807148218155, + "rewards/rejected": -0.3009185492992401, + "step": 8850 + }, + { + "epoch": 2.58, + "learning_rate": 2.864206117402146e-08, + "logits/chosen": -2.7324039936065674, + "logits/rejected": -2.7093453407287598, + "logps/chosen": -205.21630859375, + "logps/rejected": -179.09365844726562, + "loss": 0.9247, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.2105673849582672, + "rewards/margins": 0.04506702348589897, + "rewards/rejected": -0.2556344270706177, + "step": 8860 + }, + { + "epoch": 2.59, + "learning_rate": 2.8248924432976577e-08, + "logits/chosen": -2.7505710124969482, + "logits/rejected": -2.723975896835327, + "logps/chosen": -229.4464874267578, + "logps/rejected": -199.12144470214844, + "loss": 0.9504, + "rewards/accuracies": 0.5406249761581421, + "rewards/chosen": -0.22895434498786926, + "rewards/margins": 0.0746496245265007, + "rewards/rejected": -0.3036039471626282, + "step": 8870 + }, + { + "epoch": 2.59, + "learning_rate": 2.7858342817319803e-08, + "logits/chosen": -2.740084409713745, + "logits/rejected": -2.7120959758758545, + "logps/chosen": -204.06201171875, + "logps/rejected": -177.44509887695312, + "loss": 0.9258, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.19777357578277588, + "rewards/margins": 0.06542792916297913, + "rewards/rejected": -0.263201504945755, + "step": 8880 + }, + { + "epoch": 2.59, + "learning_rate": 2.7470320827525123e-08, + "logits/chosen": -2.7446672916412354, + "logits/rejected": -2.7116057872772217, + "logps/chosen": -212.20352172851562, + "logps/rejected": -192.9834442138672, + "loss": 0.9102, + "rewards/accuracies": 0.5531250238418579, + "rewards/chosen": -0.22548964619636536, + "rewards/margins": 0.09153325110673904, + "rewards/rejected": -0.3170229196548462, + "step": 8890 + }, + { + "epoch": 2.6, + "learning_rate": 2.7084862934573683e-08, + "logits/chosen": -2.7336907386779785, + "logits/rejected": -2.747609853744507, + "logps/chosen": -206.77383422851562, + "logps/rejected": -205.8780517578125, + "loss": 0.8851, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.17482876777648926, + "rewards/margins": 0.12865395843982697, + "rewards/rejected": -0.3034827709197998, + "step": 8900 + }, + { + "epoch": 2.6, + "eval_logits/chosen": -2.6622135639190674, + "eval_logits/rejected": -2.65708589553833, + "eval_logps/chosen": -197.75082397460938, + "eval_logps/rejected": -183.87017822265625, + "eval_loss": 0.931524932384491, + "eval_rewards/accuracies": 0.5815771818161011, + "eval_rewards/chosen": -0.2145525962114334, + "eval_rewards/margins": 0.08033449202775955, + "eval_rewards/rejected": -0.29488709568977356, + "eval_runtime": 443.3905, + "eval_samples_per_second": 26.534, + "eval_steps_per_second": 3.318, + "step": 8900 + }, + { + "epoch": 2.6, + "learning_rate": 2.670197357990156e-08, + "logits/chosen": -2.7496495246887207, + "logits/rejected": -2.767632007598877, + "logps/chosen": -188.72943115234375, + "logps/rejected": -182.79318237304688, + "loss": 0.9185, + "rewards/accuracies": 0.5531250238418579, + "rewards/chosen": -0.20827095210552216, + "rewards/margins": 0.09271486103534698, + "rewards/rejected": -0.30098584294319153, + "step": 8910 + }, + { + "epoch": 2.6, + "learning_rate": 2.632165717534901e-08, + "logits/chosen": -2.739619493484497, + "logits/rejected": -2.69582462310791, + "logps/chosen": -201.65185546875, + "logps/rejected": -174.01602172851562, + "loss": 0.9334, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.20441868901252747, + "rewards/margins": 0.06253104656934738, + "rewards/rejected": -0.26694971323013306, + "step": 8920 + }, + { + "epoch": 2.6, + "learning_rate": 2.5943918103109447e-08, + "logits/chosen": -2.7390568256378174, + "logits/rejected": -2.733745813369751, + "logps/chosen": -194.0807647705078, + "logps/rejected": -195.40028381347656, + "loss": 0.9154, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.2364007532596588, + "rewards/margins": 0.07870364189147949, + "rewards/rejected": -0.3151043951511383, + "step": 8930 + }, + { + "epoch": 2.61, + "learning_rate": 2.556876071567887e-08, + "logits/chosen": -2.729707956314087, + "logits/rejected": -2.7373297214508057, + "logps/chosen": -195.1029510498047, + "logps/rejected": -180.72091674804688, + "loss": 0.9018, + "rewards/accuracies": 0.590624988079071, + "rewards/chosen": -0.19179031252861023, + "rewards/margins": 0.10696914047002792, + "rewards/rejected": -0.29875949025154114, + "step": 8940 + }, + { + "epoch": 2.61, + "learning_rate": 2.5196189335806083e-08, + "logits/chosen": -2.74918794631958, + "logits/rejected": -2.7424709796905518, + "logps/chosen": -210.4421844482422, + "logps/rejected": -199.2183380126953, + "loss": 0.9257, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.21995720267295837, + "rewards/margins": 0.06796015053987503, + "rewards/rejected": -0.2879173159599304, + "step": 8950 + }, + { + "epoch": 2.61, + "learning_rate": 2.482620825644241e-08, + "logits/chosen": -2.7196192741394043, + "logits/rejected": -2.7085094451904297, + "logps/chosen": -177.93862915039062, + "logps/rejected": -169.6486053466797, + "loss": 0.921, + "rewards/accuracies": 0.559374988079071, + "rewards/chosen": -0.2068100869655609, + "rewards/margins": 0.0747736245393753, + "rewards/rejected": -0.2815837264060974, + "step": 8960 + }, + { + "epoch": 2.62, + "learning_rate": 2.4458821740692636e-08, + "logits/chosen": -2.7299575805664062, + "logits/rejected": -2.7204432487487793, + "logps/chosen": -221.1160430908203, + "logps/rejected": -197.26625061035156, + "loss": 0.9359, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.19956819713115692, + "rewards/margins": 0.10138002783060074, + "rewards/rejected": -0.30094823241233826, + "step": 8970 + }, + { + "epoch": 2.62, + "learning_rate": 2.409403402176541e-08, + "logits/chosen": -2.755445718765259, + "logits/rejected": -2.74418568611145, + "logps/chosen": -213.45803833007812, + "logps/rejected": -208.23281860351562, + "loss": 0.9075, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.20812085270881653, + "rewards/margins": 0.10565093904733658, + "rewards/rejected": -0.3137717843055725, + "step": 8980 + }, + { + "epoch": 2.62, + "learning_rate": 2.3731849302925126e-08, + "logits/chosen": -2.7552380561828613, + "logits/rejected": -2.740424633026123, + "logps/chosen": -213.5357666015625, + "logps/rejected": -198.0575408935547, + "loss": 0.9244, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.22672665119171143, + "rewards/margins": 0.07111769914627075, + "rewards/rejected": -0.2978443503379822, + "step": 8990 + }, + { + "epoch": 2.63, + "learning_rate": 2.3372271757442858e-08, + "logits/chosen": -2.7197983264923096, + "logits/rejected": -2.715674877166748, + "logps/chosen": -191.84591674804688, + "logps/rejected": -183.59007263183594, + "loss": 0.924, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.22121064364910126, + "rewards/margins": 0.08909296989440918, + "rewards/rejected": -0.31030359864234924, + "step": 9000 + }, + { + "epoch": 2.63, + "eval_logits/chosen": -2.6502623558044434, + "eval_logits/rejected": -2.6449220180511475, + "eval_logps/chosen": -197.74917602539062, + "eval_logps/rejected": -183.87181091308594, + "eval_loss": 0.9304214119911194, + "eval_rewards/accuracies": 0.5803874731063843, + "eval_rewards/chosen": -0.21438626945018768, + "eval_rewards/margins": 0.08066567778587341, + "eval_rewards/rejected": -0.2950519621372223, + "eval_runtime": 443.376, + "eval_samples_per_second": 26.535, + "eval_steps_per_second": 3.318, + "step": 9000 + }, + { + "epoch": 2.63, + "learning_rate": 2.301530552854869e-08, + "logits/chosen": -2.7371134757995605, + "logits/rejected": -2.7749149799346924, + "logps/chosen": -177.41712951660156, + "logps/rejected": -190.35281372070312, + "loss": 0.908, + "rewards/accuracies": 0.6031249761581421, + "rewards/chosen": -0.22068405151367188, + "rewards/margins": 0.09638369828462601, + "rewards/rejected": -0.3170677423477173, + "step": 9010 + }, + { + "epoch": 2.63, + "learning_rate": 2.266095472938376e-08, + "logits/chosen": -2.7563886642456055, + "logits/rejected": -2.778109073638916, + "logps/chosen": -199.50352478027344, + "logps/rejected": -190.46853637695312, + "loss": 0.9135, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.19542989134788513, + "rewards/margins": 0.11176440864801407, + "rewards/rejected": -0.307194322347641, + "step": 9020 + }, + { + "epoch": 2.63, + "learning_rate": 2.2309223442952885e-08, + "logits/chosen": -2.733668565750122, + "logits/rejected": -2.7238266468048096, + "logps/chosen": -200.59725952148438, + "logps/rejected": -185.13900756835938, + "loss": 0.9339, + "rewards/accuracies": 0.5531250238418579, + "rewards/chosen": -0.19557544589042664, + "rewards/margins": 0.06873549520969391, + "rewards/rejected": -0.26431095600128174, + "step": 9030 + }, + { + "epoch": 2.64, + "learning_rate": 2.1960115722077733e-08, + "logits/chosen": -2.767874002456665, + "logits/rejected": -2.7725155353546143, + "logps/chosen": -234.43310546875, + "logps/rejected": -232.8823699951172, + "loss": 0.9095, + "rewards/accuracies": 0.5718749761581421, + "rewards/chosen": -0.24168884754180908, + "rewards/margins": 0.10853960365056992, + "rewards/rejected": -0.350228488445282, + "step": 9040 + }, + { + "epoch": 2.64, + "learning_rate": 2.1613635589349756e-08, + "logits/chosen": -2.73789381980896, + "logits/rejected": -2.7311458587646484, + "logps/chosen": -185.69686889648438, + "logps/rejected": -173.32257080078125, + "loss": 0.9227, + "rewards/accuracies": 0.559374988079071, + "rewards/chosen": -0.25270745158195496, + "rewards/margins": 0.06281878799200058, + "rewards/rejected": -0.3155262768268585, + "step": 9050 + }, + { + "epoch": 2.64, + "learning_rate": 2.1269787037084237e-08, + "logits/chosen": -2.7241010665893555, + "logits/rejected": -2.7217373847961426, + "logps/chosen": -201.6121368408203, + "logps/rejected": -185.01535034179688, + "loss": 0.9272, + "rewards/accuracies": 0.590624988079071, + "rewards/chosen": -0.21401450037956238, + "rewards/margins": 0.08743976056575775, + "rewards/rejected": -0.30145424604415894, + "step": 9060 + }, + { + "epoch": 2.65, + "learning_rate": 2.0928574027273797e-08, + "logits/chosen": -2.737255811691284, + "logits/rejected": -2.6996257305145264, + "logps/chosen": -216.8815155029297, + "logps/rejected": -174.23831176757812, + "loss": 0.941, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.25054770708084106, + "rewards/margins": 0.06389566510915756, + "rewards/rejected": -0.31444334983825684, + "step": 9070 + }, + { + "epoch": 2.65, + "learning_rate": 2.0590000491543385e-08, + "logits/chosen": -2.7226431369781494, + "logits/rejected": -2.7095208168029785, + "logps/chosen": -228.7338104248047, + "logps/rejected": -202.36485290527344, + "loss": 0.9055, + "rewards/accuracies": 0.628125011920929, + "rewards/chosen": -0.21068677306175232, + "rewards/margins": 0.10356806218624115, + "rewards/rejected": -0.31425485014915466, + "step": 9080 + }, + { + "epoch": 2.65, + "learning_rate": 2.025407033110435e-08, + "logits/chosen": -2.7291300296783447, + "logits/rejected": -2.7181403636932373, + "logps/chosen": -221.032470703125, + "logps/rejected": -185.6308135986328, + "loss": 0.9268, + "rewards/accuracies": 0.5718749761581421, + "rewards/chosen": -0.22104379534721375, + "rewards/margins": 0.07961587607860565, + "rewards/rejected": -0.3006596863269806, + "step": 9090 + }, + { + "epoch": 2.65, + "learning_rate": 1.9920787416709967e-08, + "logits/chosen": -2.725501298904419, + "logits/rejected": -2.7245450019836426, + "logps/chosen": -204.4935302734375, + "logps/rejected": -187.82420349121094, + "loss": 0.9025, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.19262897968292236, + "rewards/margins": 0.11257578432559967, + "rewards/rejected": -0.30520474910736084, + "step": 9100 + }, + { + "epoch": 2.65, + "eval_logits/chosen": -2.6463818550109863, + "eval_logits/rejected": -2.64095401763916, + "eval_logps/chosen": -197.75514221191406, + "eval_logps/rejected": -183.8715362548828, + "eval_loss": 0.9314547181129456, + "eval_rewards/accuracies": 0.5790278911590576, + "eval_rewards/chosen": -0.2149849534034729, + "eval_rewards/margins": 0.0800388753414154, + "eval_rewards/rejected": -0.2950238287448883, + "eval_runtime": 443.3623, + "eval_samples_per_second": 26.536, + "eval_steps_per_second": 3.318, + "step": 9100 + }, + { + "epoch": 2.66, + "learning_rate": 1.959015558861049e-08, + "logits/chosen": -2.7422285079956055, + "logits/rejected": -2.713832378387451, + "logps/chosen": -191.2167205810547, + "logps/rejected": -172.79429626464844, + "loss": 0.9232, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.2038939744234085, + "rewards/margins": 0.08725843578577042, + "rewards/rejected": -0.29115238785743713, + "step": 9110 + }, + { + "epoch": 2.66, + "learning_rate": 1.926217865650906e-08, + "logits/chosen": -2.732168674468994, + "logits/rejected": -2.72629976272583, + "logps/chosen": -201.04891967773438, + "logps/rejected": -180.03712463378906, + "loss": 0.9082, + "rewards/accuracies": 0.596875011920929, + "rewards/chosen": -0.21944966912269592, + "rewards/margins": 0.10418324172496796, + "rewards/rejected": -0.3236328959465027, + "step": 9120 + }, + { + "epoch": 2.66, + "learning_rate": 1.8936860399517947e-08, + "logits/chosen": -2.73362398147583, + "logits/rejected": -2.71791410446167, + "logps/chosen": -184.18553161621094, + "logps/rejected": -170.3592071533203, + "loss": 0.9322, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.2177029401063919, + "rewards/margins": 0.06241176277399063, + "rewards/rejected": -0.28011471033096313, + "step": 9130 + }, + { + "epoch": 2.67, + "learning_rate": 1.8614204566114622e-08, + "logits/chosen": -2.7433769702911377, + "logits/rejected": -2.768494129180908, + "logps/chosen": -184.1487274169922, + "logps/rejected": -171.6521759033203, + "loss": 0.9207, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.2070891559123993, + "rewards/margins": 0.06774391233921051, + "rewards/rejected": -0.2748330533504486, + "step": 9140 + }, + { + "epoch": 2.67, + "learning_rate": 1.8294214874099045e-08, + "logits/chosen": -2.7814841270446777, + "logits/rejected": -2.751478672027588, + "logps/chosen": -218.31857299804688, + "logps/rejected": -192.98916625976562, + "loss": 0.9206, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.25284838676452637, + "rewards/margins": 0.07783571630716324, + "rewards/rejected": -0.33068403601646423, + "step": 9150 + }, + { + "epoch": 2.67, + "learning_rate": 1.7976895010550304e-08, + "logits/chosen": -2.7278478145599365, + "logits/rejected": -2.712846517562866, + "logps/chosen": -214.8646697998047, + "logps/rejected": -182.18746948242188, + "loss": 0.9261, + "rewards/accuracies": 0.5531250238418579, + "rewards/chosen": -0.1966797411441803, + "rewards/margins": 0.07067761570215225, + "rewards/rejected": -0.26735737919807434, + "step": 9160 + }, + { + "epoch": 2.67, + "learning_rate": 1.766224863178467e-08, + "logits/chosen": -2.701188087463379, + "logits/rejected": -2.710338830947876, + "logps/chosen": -200.46682739257812, + "logps/rejected": -189.87393188476562, + "loss": 0.9239, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.20917856693267822, + "rewards/margins": 0.08873340487480164, + "rewards/rejected": -0.29791194200515747, + "step": 9170 + }, + { + "epoch": 2.68, + "learning_rate": 1.7350279363312988e-08, + "logits/chosen": -2.704545259475708, + "logits/rejected": -2.7386326789855957, + "logps/chosen": -210.10971069335938, + "logps/rejected": -196.05384826660156, + "loss": 0.913, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.2131265103816986, + "rewards/margins": 0.09158849716186523, + "rewards/rejected": -0.3047150671482086, + "step": 9180 + }, + { + "epoch": 2.68, + "learning_rate": 1.7040990799799317e-08, + "logits/chosen": -2.7536277770996094, + "logits/rejected": -2.7470672130584717, + "logps/chosen": -183.75985717773438, + "logps/rejected": -184.50489807128906, + "loss": 0.9058, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.20402200520038605, + "rewards/margins": 0.10172203928232193, + "rewards/rejected": -0.3057440221309662, + "step": 9190 + }, + { + "epoch": 2.68, + "learning_rate": 1.6734386505019155e-08, + "logits/chosen": -2.7178406715393066, + "logits/rejected": -2.720010280609131, + "logps/chosen": -207.04568481445312, + "logps/rejected": -187.1564483642578, + "loss": 0.9348, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.21785137057304382, + "rewards/margins": 0.08800145983695984, + "rewards/rejected": -0.30585283041000366, + "step": 9200 + }, + { + "epoch": 2.68, + "eval_logits/chosen": -2.640465021133423, + "eval_logits/rejected": -2.63492488861084, + "eval_logps/chosen": -197.74913024902344, + "eval_logps/rejected": -183.866943359375, + "eval_loss": 0.930846095085144, + "eval_rewards/accuracies": 0.5802175402641296, + "eval_rewards/chosen": -0.21438243985176086, + "eval_rewards/margins": 0.08018327504396439, + "eval_rewards/rejected": -0.29456573724746704, + "eval_runtime": 443.5095, + "eval_samples_per_second": 26.527, + "eval_steps_per_second": 3.317, + "step": 9200 + }, + { + "epoch": 2.69, + "learning_rate": 1.643047001181852e-08, + "logits/chosen": -2.707907199859619, + "logits/rejected": -2.722409248352051, + "logps/chosen": -178.1352996826172, + "logps/rejected": -183.51571655273438, + "loss": 0.9095, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.22640416026115417, + "rewards/margins": 0.11551691591739655, + "rewards/rejected": -0.3419210910797119, + "step": 9210 + }, + { + "epoch": 2.69, + "learning_rate": 1.6129244822073424e-08, + "logits/chosen": -2.707960844039917, + "logits/rejected": -2.696136951446533, + "logps/chosen": -206.0924530029297, + "logps/rejected": -188.82069396972656, + "loss": 0.9137, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.24374035000801086, + "rewards/margins": 0.07161261886358261, + "rewards/rejected": -0.3153529763221741, + "step": 9220 + }, + { + "epoch": 2.69, + "learning_rate": 1.5830714406649155e-08, + "logits/chosen": -2.6887340545654297, + "logits/rejected": -2.696369171142578, + "logps/chosen": -205.6720733642578, + "logps/rejected": -201.45242309570312, + "loss": 0.8953, + "rewards/accuracies": 0.596875011920929, + "rewards/chosen": -0.18328827619552612, + "rewards/margins": 0.1293097734451294, + "rewards/rejected": -0.3125980794429779, + "step": 9230 + }, + { + "epoch": 2.7, + "learning_rate": 1.5534882205360645e-08, + "logits/chosen": -2.766343355178833, + "logits/rejected": -2.7414097785949707, + "logps/chosen": -202.77731323242188, + "logps/rejected": -172.8382110595703, + "loss": 0.9419, + "rewards/accuracies": 0.590624988079071, + "rewards/chosen": -0.22233736515045166, + "rewards/margins": 0.08321109414100647, + "rewards/rejected": -0.30554842948913574, + "step": 9240 + }, + { + "epoch": 2.7, + "learning_rate": 1.5241751626932503e-08, + "logits/chosen": -2.729731559753418, + "logits/rejected": -2.709348678588867, + "logps/chosen": -196.87371826171875, + "logps/rejected": -176.45614624023438, + "loss": 0.9371, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.20812328159809113, + "rewards/margins": 0.07200122624635696, + "rewards/rejected": -0.2801244854927063, + "step": 9250 + }, + { + "epoch": 2.7, + "learning_rate": 1.4951326048960073e-08, + "logits/chosen": -2.7518060207366943, + "logits/rejected": -2.7505202293395996, + "logps/chosen": -209.6000518798828, + "logps/rejected": -194.99954223632812, + "loss": 0.9341, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.22044873237609863, + "rewards/margins": 0.07619608938694, + "rewards/rejected": -0.29664483666419983, + "step": 9260 + }, + { + "epoch": 2.7, + "learning_rate": 1.4663608817870182e-08, + "logits/chosen": -2.724161148071289, + "logits/rejected": -2.7372405529022217, + "logps/chosen": -186.85226440429688, + "logps/rejected": -192.1614532470703, + "loss": 0.9183, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.1896825134754181, + "rewards/margins": 0.08983322232961655, + "rewards/rejected": -0.27951571345329285, + "step": 9270 + }, + { + "epoch": 2.71, + "learning_rate": 1.4378603248882932e-08, + "logits/chosen": -2.7284820079803467, + "logits/rejected": -2.745692729949951, + "logps/chosen": -189.42337036132812, + "logps/rejected": -192.413818359375, + "loss": 0.932, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.26627618074417114, + "rewards/margins": 0.08315873891115189, + "rewards/rejected": -0.34943491220474243, + "step": 9280 + }, + { + "epoch": 2.71, + "learning_rate": 1.4096312625973162e-08, + "logits/chosen": -2.7349767684936523, + "logits/rejected": -2.7317709922790527, + "logps/chosen": -185.63861083984375, + "logps/rejected": -187.82980346679688, + "loss": 0.9297, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.2128647118806839, + "rewards/margins": 0.06893138587474823, + "rewards/rejected": -0.2817961275577545, + "step": 9290 + }, + { + "epoch": 2.71, + "learning_rate": 1.3816740201832794e-08, + "logits/chosen": -2.7093145847320557, + "logits/rejected": -2.722029209136963, + "logps/chosen": -192.80197143554688, + "logps/rejected": -192.15017700195312, + "loss": 0.9067, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.20303645730018616, + "rewards/margins": 0.1047695130109787, + "rewards/rejected": -0.30780598521232605, + "step": 9300 + }, + { + "epoch": 2.71, + "eval_logits/chosen": -2.646462917327881, + "eval_logits/rejected": -2.641040086746216, + "eval_logps/chosen": -197.75985717773438, + "eval_logps/rejected": -183.88047790527344, + "eval_loss": 0.9311975240707397, + "eval_rewards/accuracies": 0.5856559872627258, + "eval_rewards/chosen": -0.21545521914958954, + "eval_rewards/margins": 0.0804641842842102, + "eval_rewards/rejected": -0.29591938853263855, + "eval_runtime": 443.4347, + "eval_samples_per_second": 26.532, + "eval_steps_per_second": 3.317, + "step": 9300 + }, + { + "epoch": 2.72, + "learning_rate": 1.3539889197833416e-08, + "logits/chosen": -2.7400126457214355, + "logits/rejected": -2.731029510498047, + "logps/chosen": -205.12112426757812, + "logps/rejected": -187.09153747558594, + "loss": 0.9299, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.22620835900306702, + "rewards/margins": 0.06900273263454437, + "rewards/rejected": -0.2952111065387726, + "step": 9310 + }, + { + "epoch": 2.72, + "learning_rate": 1.3265762803988944e-08, + "logits/chosen": -2.740659236907959, + "logits/rejected": -2.766366720199585, + "logps/chosen": -185.3249969482422, + "logps/rejected": -191.77540588378906, + "loss": 0.9077, + "rewards/accuracies": 0.5531250238418579, + "rewards/chosen": -0.20000505447387695, + "rewards/margins": 0.10223666578531265, + "rewards/rejected": -0.302241712808609, + "step": 9320 + }, + { + "epoch": 2.72, + "learning_rate": 1.299436417891911e-08, + "logits/chosen": -2.743675947189331, + "logits/rejected": -2.7326626777648926, + "logps/chosen": -195.64309692382812, + "logps/rejected": -174.15208435058594, + "loss": 0.922, + "rewards/accuracies": 0.640625, + "rewards/chosen": -0.19279178977012634, + "rewards/margins": 0.1185562014579773, + "rewards/rejected": -0.31134796142578125, + "step": 9330 + }, + { + "epoch": 2.72, + "learning_rate": 1.2725696449812779e-08, + "logits/chosen": -2.7424731254577637, + "logits/rejected": -2.733281135559082, + "logps/chosen": -202.0879364013672, + "logps/rejected": -185.03854370117188, + "loss": 0.9254, + "rewards/accuracies": 0.6156250238418579, + "rewards/chosen": -0.19211646914482117, + "rewards/margins": 0.09572719037532806, + "rewards/rejected": -0.28784364461898804, + "step": 9340 + }, + { + "epoch": 2.73, + "learning_rate": 1.2459762712392268e-08, + "logits/chosen": -2.722184419631958, + "logits/rejected": -2.7161264419555664, + "logps/chosen": -187.12063598632812, + "logps/rejected": -177.51832580566406, + "loss": 0.9, + "rewards/accuracies": 0.6156250238418579, + "rewards/chosen": -0.2033185213804245, + "rewards/margins": 0.09994320571422577, + "rewards/rejected": -0.30326172709465027, + "step": 9350 + }, + { + "epoch": 2.73, + "learning_rate": 1.2196566030877348e-08, + "logits/chosen": -2.7002055644989014, + "logits/rejected": -2.729663372039795, + "logps/chosen": -167.5603790283203, + "logps/rejected": -175.6070556640625, + "loss": 0.9287, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.21544072031974792, + "rewards/margins": 0.08538533747196198, + "rewards/rejected": -0.3008260726928711, + "step": 9360 + }, + { + "epoch": 2.73, + "learning_rate": 1.1936109437950231e-08, + "logits/chosen": -2.7116270065307617, + "logits/rejected": -2.7060136795043945, + "logps/chosen": -189.29359436035156, + "logps/rejected": -184.63731384277344, + "loss": 0.9068, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.2106679379940033, + "rewards/margins": 0.1083311215043068, + "rewards/rejected": -0.3189990818500519, + "step": 9370 + }, + { + "epoch": 2.74, + "learning_rate": 1.1678395934720308e-08, + "logits/chosen": -2.720411539077759, + "logits/rejected": -2.7151317596435547, + "logps/chosen": -210.22427368164062, + "logps/rejected": -194.78689575195312, + "loss": 0.9022, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.245496466755867, + "rewards/margins": 0.09903912246227264, + "rewards/rejected": -0.34453555941581726, + "step": 9380 + }, + { + "epoch": 2.74, + "learning_rate": 1.1423428490689834e-08, + "logits/chosen": -2.7146785259246826, + "logits/rejected": -2.7142322063446045, + "logps/chosen": -186.73641967773438, + "logps/rejected": -177.76785278320312, + "loss": 0.9128, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.2251826971769333, + "rewards/margins": 0.09867312014102936, + "rewards/rejected": -0.32385581731796265, + "step": 9390 + }, + { + "epoch": 2.74, + "learning_rate": 1.1171210043719626e-08, + "logits/chosen": -2.7494256496429443, + "logits/rejected": -2.7391581535339355, + "logps/chosen": -223.8762664794922, + "logps/rejected": -191.97633361816406, + "loss": 0.9263, + "rewards/accuracies": 0.628125011920929, + "rewards/chosen": -0.22717516124248505, + "rewards/margins": 0.09901183843612671, + "rewards/rejected": -0.32618698477745056, + "step": 9400 + }, + { + "epoch": 2.74, + "eval_logits/chosen": -2.648622751235962, + "eval_logits/rejected": -2.643249750137329, + "eval_logps/chosen": -197.7535858154297, + "eval_logps/rejected": -183.8784637451172, + "eval_loss": 0.9307305216789246, + "eval_rewards/accuracies": 0.5829367637634277, + "eval_rewards/chosen": -0.2148279845714569, + "eval_rewards/margins": 0.08088845759630203, + "eval_rewards/rejected": -0.29571646451950073, + "eval_runtime": 443.3903, + "eval_samples_per_second": 26.534, + "eval_steps_per_second": 3.318, + "step": 9400 + }, + { + "epoch": 2.74, + "learning_rate": 1.0921743499995139e-08, + "logits/chosen": -2.749537467956543, + "logits/rejected": -2.7279648780822754, + "logps/chosen": -220.28036499023438, + "logps/rejected": -194.126708984375, + "loss": 0.9278, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.23800167441368103, + "rewards/margins": 0.05967860668897629, + "rewards/rejected": -0.2976802885532379, + "step": 9410 + }, + { + "epoch": 2.75, + "learning_rate": 1.0675031733993144e-08, + "logits/chosen": -2.765080690383911, + "logits/rejected": -2.728818893432617, + "logps/chosen": -207.89999389648438, + "logps/rejected": -188.54592895507812, + "loss": 0.9229, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.2473161667585373, + "rewards/margins": 0.0724390298128128, + "rewards/rejected": -0.3197552263736725, + "step": 9420 + }, + { + "epoch": 2.75, + "learning_rate": 1.0431077588448301e-08, + "logits/chosen": -2.7497363090515137, + "logits/rejected": -2.762665271759033, + "logps/chosen": -187.5045623779297, + "logps/rejected": -177.8409881591797, + "loss": 0.9288, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.2124253511428833, + "rewards/margins": 0.09188083559274673, + "rewards/rejected": -0.3043062090873718, + "step": 9430 + }, + { + "epoch": 2.75, + "learning_rate": 1.018988387432082e-08, + "logits/chosen": -2.740898847579956, + "logits/rejected": -2.7467055320739746, + "logps/chosen": -193.87625122070312, + "logps/rejected": -191.6377410888672, + "loss": 0.9209, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.2214924544095993, + "rewards/margins": 0.08042500913143158, + "rewards/rejected": -0.30191749334335327, + "step": 9440 + }, + { + "epoch": 2.76, + "learning_rate": 9.951453370763725e-09, + "logits/chosen": -2.716641426086426, + "logits/rejected": -2.7378287315368652, + "logps/chosen": -182.84176635742188, + "logps/rejected": -182.30215454101562, + "loss": 0.9041, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.22670023143291473, + "rewards/margins": 0.0805177167057991, + "rewards/rejected": -0.30721792578697205, + "step": 9450 + }, + { + "epoch": 2.76, + "learning_rate": 9.715788825091053e-09, + "logits/chosen": -2.7388954162597656, + "logits/rejected": -2.7575676441192627, + "logps/chosen": -190.92247009277344, + "logps/rejected": -194.45704650878906, + "loss": 0.9242, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.2476111650466919, + "rewards/margins": 0.07143766433000565, + "rewards/rejected": -0.31904882192611694, + "step": 9460 + }, + { + "epoch": 2.76, + "learning_rate": 9.482892952746069e-09, + "logits/chosen": -2.7590038776397705, + "logits/rejected": -2.734876871109009, + "logps/chosen": -212.1120147705078, + "logps/rejected": -186.57237243652344, + "loss": 0.9293, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.21230432391166687, + "rewards/margins": 0.08338922262191772, + "rewards/rejected": -0.2956935465335846, + "step": 9470 + }, + { + "epoch": 2.77, + "learning_rate": 9.252768437269965e-09, + "logits/chosen": -2.738917827606201, + "logits/rejected": -2.7594659328460693, + "logps/chosen": -179.0370330810547, + "logps/rejected": -184.1522216796875, + "loss": 0.9011, + "rewards/accuracies": 0.6156250238418579, + "rewards/chosen": -0.22402581572532654, + "rewards/margins": 0.12652428448200226, + "rewards/rejected": -0.35055011510849, + "step": 9480 + }, + { + "epoch": 2.77, + "learning_rate": 9.025417930271062e-09, + "logits/chosen": -2.714035749435425, + "logits/rejected": -2.717360019683838, + "logps/chosen": -175.53945922851562, + "logps/rejected": -170.43405151367188, + "loss": 0.9164, + "rewards/accuracies": 0.590624988079071, + "rewards/chosen": -0.2230546921491623, + "rewards/margins": 0.10123791545629501, + "rewards/rejected": -0.3242926597595215, + "step": 9490 + }, + { + "epoch": 2.77, + "learning_rate": 8.800844051394168e-09, + "logits/chosen": -2.7311530113220215, + "logits/rejected": -2.7537662982940674, + "logps/chosen": -188.2486572265625, + "logps/rejected": -193.36257934570312, + "loss": 0.912, + "rewards/accuracies": 0.6031249761581421, + "rewards/chosen": -0.2432141751050949, + "rewards/margins": 0.11424192041158676, + "rewards/rejected": -0.35745611786842346, + "step": 9500 + }, + { + "epoch": 2.77, + "eval_logits/chosen": -2.6494829654693604, + "eval_logits/rejected": -2.644101142883301, + "eval_logps/chosen": -197.75807189941406, + "eval_logps/rejected": -183.8787841796875, + "eval_loss": 0.9305983781814575, + "eval_rewards/accuracies": 0.5822569727897644, + "eval_rewards/chosen": -0.21527476608753204, + "eval_rewards/margins": 0.08047327399253845, + "eval_rewards/rejected": -0.2957480549812317, + "eval_runtime": 443.3997, + "eval_samples_per_second": 26.534, + "eval_steps_per_second": 3.318, + "step": 9500 + }, + { + "epoch": 2.77, + "learning_rate": 8.579049388290432e-09, + "logits/chosen": -2.753450393676758, + "logits/rejected": -2.7338128089904785, + "logps/chosen": -208.67062377929688, + "logps/rejected": -187.44216918945312, + "loss": 0.9423, + "rewards/accuracies": 0.5531250238418579, + "rewards/chosen": -0.21917076408863068, + "rewards/margins": 0.09175173938274384, + "rewards/rejected": -0.3109225034713745, + "step": 9510 + }, + { + "epoch": 2.78, + "learning_rate": 8.360036496587397e-09, + "logits/chosen": -2.716602087020874, + "logits/rejected": -2.725924015045166, + "logps/chosen": -194.48460388183594, + "logps/rejected": -184.2608642578125, + "loss": 0.9136, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.22760502994060516, + "rewards/margins": 0.08745179325342178, + "rewards/rejected": -0.31505683064460754, + "step": 9520 + }, + { + "epoch": 2.78, + "learning_rate": 8.143807899859828e-09, + "logits/chosen": -2.737185001373291, + "logits/rejected": -2.7138681411743164, + "logps/chosen": -226.47079467773438, + "logps/rejected": -194.29702758789062, + "loss": 0.931, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.21239247918128967, + "rewards/margins": 0.07970704883337021, + "rewards/rejected": -0.29209956526756287, + "step": 9530 + }, + { + "epoch": 2.78, + "learning_rate": 7.930366089600321e-09, + "logits/chosen": -2.7296338081359863, + "logits/rejected": -2.7105979919433594, + "logps/chosen": -194.48092651367188, + "logps/rejected": -175.62625122070312, + "loss": 0.9307, + "rewards/accuracies": 0.5531250238418579, + "rewards/chosen": -0.23873183131217957, + "rewards/margins": 0.07228745520114899, + "rewards/rejected": -0.31101930141448975, + "step": 9540 + }, + { + "epoch": 2.79, + "learning_rate": 7.719713525190851e-09, + "logits/chosen": -2.7413105964660645, + "logits/rejected": -2.740273952484131, + "logps/chosen": -199.0068359375, + "logps/rejected": -181.02638244628906, + "loss": 0.9055, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.2191927433013916, + "rewards/margins": 0.1066521629691124, + "rewards/rejected": -0.3258448541164398, + "step": 9550 + }, + { + "epoch": 2.79, + "learning_rate": 7.511852633874299e-09, + "logits/chosen": -2.757634162902832, + "logits/rejected": -2.7396769523620605, + "logps/chosen": -212.4727783203125, + "logps/rejected": -192.638427734375, + "loss": 0.9276, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.23950496315956116, + "rewards/margins": 0.06648717075586319, + "rewards/rejected": -0.305992066860199, + "step": 9560 + }, + { + "epoch": 2.79, + "learning_rate": 7.3067858107264125e-09, + "logits/chosen": -2.7161240577697754, + "logits/rejected": -2.7146925926208496, + "logps/chosen": -202.38369750976562, + "logps/rejected": -185.82192993164062, + "loss": 0.9349, + "rewards/accuracies": 0.559374988079071, + "rewards/chosen": -0.22953566908836365, + "rewards/margins": 0.08001131564378738, + "rewards/rejected": -0.3095470070838928, + "step": 9570 + }, + { + "epoch": 2.79, + "learning_rate": 7.104515418628443e-09, + "logits/chosen": -2.74570894241333, + "logits/rejected": -2.7499148845672607, + "logps/chosen": -203.444091796875, + "logps/rejected": -184.25234985351562, + "loss": 0.9122, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.21463234722614288, + "rewards/margins": 0.0944782942533493, + "rewards/rejected": -0.3091106116771698, + "step": 9580 + }, + { + "epoch": 2.8, + "learning_rate": 6.905043788239695e-09, + "logits/chosen": -2.733797311782837, + "logits/rejected": -2.738307476043701, + "logps/chosen": -205.6097869873047, + "logps/rejected": -186.2257080078125, + "loss": 0.949, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.24760353565216064, + "rewards/margins": 0.06620237976312637, + "rewards/rejected": -0.3138059377670288, + "step": 9590 + }, + { + "epoch": 2.8, + "learning_rate": 6.708373217970853e-09, + "logits/chosen": -2.7177987098693848, + "logits/rejected": -2.7118101119995117, + "logps/chosen": -204.19784545898438, + "logps/rejected": -184.19650268554688, + "loss": 0.9157, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.19713929295539856, + "rewards/margins": 0.09181550145149231, + "rewards/rejected": -0.28895479440689087, + "step": 9600 + }, + { + "epoch": 2.8, + "eval_logits/chosen": -2.649334669113159, + "eval_logits/rejected": -2.6439483165740967, + "eval_logps/chosen": -197.7744903564453, + "eval_logps/rejected": -183.8859405517578, + "eval_loss": 0.931446373462677, + "eval_rewards/accuracies": 0.5785180330276489, + "eval_rewards/chosen": -0.2169175148010254, + "eval_rewards/margins": 0.07954936474561691, + "eval_rewards/rejected": -0.2964669167995453, + "eval_runtime": 443.4093, + "eval_samples_per_second": 26.533, + "eval_steps_per_second": 3.317, + "step": 9600 + }, + { + "epoch": 2.8, + "learning_rate": 6.514505973957252e-09, + "logits/chosen": -2.7404611110687256, + "logits/rejected": -2.7086660861968994, + "logps/chosen": -198.59323120117188, + "logps/rejected": -163.24148559570312, + "loss": 0.9186, + "rewards/accuracies": 0.5718749761581421, + "rewards/chosen": -0.2203093320131302, + "rewards/margins": 0.06923945248126984, + "rewards/rejected": -0.2895487844944, + "step": 9610 + }, + { + "epoch": 2.81, + "learning_rate": 6.323444290033064e-09, + "logits/chosen": -2.7444145679473877, + "logits/rejected": -2.7581942081451416, + "logps/chosen": -183.66000366210938, + "logps/rejected": -187.50717163085938, + "loss": 0.921, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.19854137301445007, + "rewards/margins": 0.07382546365261078, + "rewards/rejected": -0.27236682176589966, + "step": 9620 + }, + { + "epoch": 2.81, + "learning_rate": 6.135190367705295e-09, + "logits/chosen": -2.729832172393799, + "logits/rejected": -2.721663236618042, + "logps/chosen": -196.37266540527344, + "logps/rejected": -175.19287109375, + "loss": 0.9094, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.19448015093803406, + "rewards/margins": 0.09932075440883636, + "rewards/rejected": -0.29380089044570923, + "step": 9630 + }, + { + "epoch": 2.81, + "learning_rate": 5.949746376128662e-09, + "logits/chosen": -2.7646279335021973, + "logits/rejected": -2.739750385284424, + "logps/chosen": -203.86634826660156, + "logps/rejected": -182.5919952392578, + "loss": 0.9157, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.20224671065807343, + "rewards/margins": 0.08542544394731522, + "rewards/rejected": -0.28767213225364685, + "step": 9640 + }, + { + "epoch": 2.81, + "learning_rate": 5.767114452080363e-09, + "logits/chosen": -2.7331650257110596, + "logits/rejected": -2.707552433013916, + "logps/chosen": -195.32037353515625, + "logps/rejected": -173.34744262695312, + "loss": 0.9152, + "rewards/accuracies": 0.590624988079071, + "rewards/chosen": -0.20608988404273987, + "rewards/margins": 0.07609757035970688, + "rewards/rejected": -0.28218746185302734, + "step": 9650 + }, + { + "epoch": 2.82, + "learning_rate": 5.587296699935629e-09, + "logits/chosen": -2.733536958694458, + "logits/rejected": -2.727839231491089, + "logps/chosen": -209.40878295898438, + "logps/rejected": -185.87734985351562, + "loss": 0.9242, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.25389450788497925, + "rewards/margins": 0.058593083173036575, + "rewards/rejected": -0.3124876022338867, + "step": 9660 + }, + { + "epoch": 2.82, + "learning_rate": 5.410295191643349e-09, + "logits/chosen": -2.7379884719848633, + "logits/rejected": -2.724625587463379, + "logps/chosen": -188.17678833007812, + "logps/rejected": -170.9092559814453, + "loss": 0.9321, + "rewards/accuracies": 0.5406249761581421, + "rewards/chosen": -0.21372437477111816, + "rewards/margins": 0.07617217302322388, + "rewards/rejected": -0.28989654779434204, + "step": 9670 + }, + { + "epoch": 2.82, + "learning_rate": 5.236111966702345e-09, + "logits/chosen": -2.724616765975952, + "logits/rejected": -2.751457691192627, + "logps/chosen": -181.1899871826172, + "logps/rejected": -180.98452758789062, + "loss": 0.9117, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.21435685455799103, + "rewards/margins": 0.09840370714664459, + "rewards/rejected": -0.31276053190231323, + "step": 9680 + }, + { + "epoch": 2.83, + "learning_rate": 5.064749032137744e-09, + "logits/chosen": -2.7458510398864746, + "logits/rejected": -2.7312254905700684, + "logps/chosen": -204.89797973632812, + "logps/rejected": -188.15606689453125, + "loss": 0.9283, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.18471845984458923, + "rewards/margins": 0.10320683568716049, + "rewards/rejected": -0.2879253029823303, + "step": 9690 + }, + { + "epoch": 2.83, + "learning_rate": 4.896208362477838e-09, + "logits/chosen": -2.7029900550842285, + "logits/rejected": -2.713294267654419, + "logps/chosen": -195.61605834960938, + "logps/rejected": -188.32113647460938, + "loss": 0.9094, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.18786336481571198, + "rewards/margins": 0.10033879429101944, + "rewards/rejected": -0.28820210695266724, + "step": 9700 + }, + { + "epoch": 2.83, + "eval_logits/chosen": -2.649444580078125, + "eval_logits/rejected": -2.6440680027008057, + "eval_logps/chosen": -197.7625274658203, + "eval_logps/rejected": -183.88259887695312, + "eval_loss": 0.9309257864952087, + "eval_rewards/accuracies": 0.5831067562103271, + "eval_rewards/chosen": -0.21572300791740417, + "eval_rewards/margins": 0.08040815591812134, + "eval_rewards/rejected": -0.2961311936378479, + "eval_runtime": 443.3908, + "eval_samples_per_second": 26.534, + "eval_steps_per_second": 3.318, + "step": 9700 + }, + { + "epoch": 2.83, + "learning_rate": 4.730491899731487e-09, + "logits/chosen": -2.734896183013916, + "logits/rejected": -2.7162668704986572, + "logps/chosen": -209.939453125, + "logps/rejected": -182.87033081054688, + "loss": 0.9165, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.22276946902275085, + "rewards/margins": 0.08350035548210144, + "rewards/rejected": -0.3062697947025299, + "step": 9710 + }, + { + "epoch": 2.84, + "learning_rate": 4.567601553365608e-09, + "logits/chosen": -2.740227460861206, + "logits/rejected": -2.737812042236328, + "logps/chosen": -206.8555908203125, + "logps/rejected": -187.4201202392578, + "loss": 0.922, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.224287748336792, + "rewards/margins": 0.08504447340965271, + "rewards/rejected": -0.3093322217464447, + "step": 9720 + }, + { + "epoch": 2.84, + "learning_rate": 4.407539200283167e-09, + "logits/chosen": -2.7353711128234863, + "logits/rejected": -2.7307136058807373, + "logps/chosen": -200.0714569091797, + "logps/rejected": -183.77415466308594, + "loss": 0.923, + "rewards/accuracies": 0.5406249761581421, + "rewards/chosen": -0.19830277562141418, + "rewards/margins": 0.08112906664609909, + "rewards/rejected": -0.27943187952041626, + "step": 9730 + }, + { + "epoch": 2.84, + "learning_rate": 4.250306684801696e-09, + "logits/chosen": -2.771733045578003, + "logits/rejected": -2.7563509941101074, + "logps/chosen": -206.34060668945312, + "logps/rejected": -193.19229125976562, + "loss": 0.9129, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.23825040459632874, + "rewards/margins": 0.09162227809429169, + "rewards/rejected": -0.32987266778945923, + "step": 9740 + }, + { + "epoch": 2.84, + "learning_rate": 4.095905818631784e-09, + "logits/chosen": -2.7209630012512207, + "logits/rejected": -2.75238037109375, + "logps/chosen": -176.484130859375, + "logps/rejected": -178.44244384765625, + "loss": 0.9192, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.22507627308368683, + "rewards/margins": 0.07493752241134644, + "rewards/rejected": -0.30001378059387207, + "step": 9750 + }, + { + "epoch": 2.85, + "learning_rate": 3.9443383808565326e-09, + "logits/chosen": -2.765580177307129, + "logits/rejected": -2.7446515560150146, + "logps/chosen": -209.17422485351562, + "logps/rejected": -183.21139526367188, + "loss": 0.9416, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.2306930124759674, + "rewards/margins": 0.06615431606769562, + "rewards/rejected": -0.29684728384017944, + "step": 9760 + }, + { + "epoch": 2.85, + "learning_rate": 3.795606117910743e-09, + "logits/chosen": -2.7446208000183105, + "logits/rejected": -2.7233622074127197, + "logps/chosen": -183.44345092773438, + "logps/rejected": -163.6812744140625, + "loss": 0.9275, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.231636643409729, + "rewards/margins": 0.06933009624481201, + "rewards/rejected": -0.30096670985221863, + "step": 9770 + }, + { + "epoch": 2.85, + "learning_rate": 3.6497107435610462e-09, + "logits/chosen": -2.709591865539551, + "logits/rejected": -2.7183585166931152, + "logps/chosen": -194.71524047851562, + "logps/rejected": -182.2717742919922, + "loss": 0.9439, + "rewards/accuracies": 0.5718749761581421, + "rewards/chosen": -0.22313237190246582, + "rewards/margins": 0.057538557797670364, + "rewards/rejected": -0.2806709110736847, + "step": 9780 + }, + { + "epoch": 2.86, + "learning_rate": 3.5066539388859116e-09, + "logits/chosen": -2.747555732727051, + "logits/rejected": -2.7734694480895996, + "logps/chosen": -205.0553741455078, + "logps/rejected": -202.21759033203125, + "loss": 0.9144, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.2210444211959839, + "rewards/margins": 0.11726301908493042, + "rewards/rejected": -0.3383074402809143, + "step": 9790 + }, + { + "epoch": 2.86, + "learning_rate": 3.366437352256557e-09, + "logits/chosen": -2.7042324542999268, + "logits/rejected": -2.7263922691345215, + "logps/chosen": -183.5642547607422, + "logps/rejected": -185.1039276123047, + "loss": 0.9256, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.21685466170310974, + "rewards/margins": 0.10197044909000397, + "rewards/rejected": -0.3188251256942749, + "step": 9800 + }, + { + "epoch": 2.86, + "eval_logits/chosen": -2.649299144744873, + "eval_logits/rejected": -2.6439149379730225, + "eval_logps/chosen": -197.76528930664062, + "eval_logps/rejected": -183.88673400878906, + "eval_loss": 0.9303866028785706, + "eval_rewards/accuracies": 0.5837865471839905, + "eval_rewards/chosen": -0.21599791944026947, + "eval_rewards/margins": 0.08054690808057785, + "eval_rewards/rejected": -0.2965448200702667, + "eval_runtime": 443.4139, + "eval_samples_per_second": 26.533, + "eval_steps_per_second": 3.317, + "step": 9800 + }, + { + "epoch": 2.86, + "learning_rate": 3.229062599317656e-09, + "logits/chosen": -2.721593141555786, + "logits/rejected": -2.737192153930664, + "logps/chosen": -176.07333374023438, + "logps/rejected": -174.5138702392578, + "loss": 0.9269, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.2209792584180832, + "rewards/margins": 0.06416002660989761, + "rewards/rejected": -0.285139262676239, + "step": 9810 + }, + { + "epoch": 2.86, + "learning_rate": 3.094531262969019e-09, + "logits/chosen": -2.754240036010742, + "logits/rejected": -2.7274465560913086, + "logps/chosen": -206.8941650390625, + "logps/rejected": -177.24270629882812, + "loss": 0.9209, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.188827782869339, + "rewards/margins": 0.09714638441801071, + "rewards/rejected": -0.2859741747379303, + "step": 9820 + }, + { + "epoch": 2.87, + "learning_rate": 2.9628448933470827e-09, + "logits/chosen": -2.7514803409576416, + "logits/rejected": -2.7271556854248047, + "logps/chosen": -219.15768432617188, + "logps/rejected": -199.83834838867188, + "loss": 0.9092, + "rewards/accuracies": 0.590624988079071, + "rewards/chosen": -0.2370646893978119, + "rewards/margins": 0.10409317165613174, + "rewards/rejected": -0.34115785360336304, + "step": 9830 + }, + { + "epoch": 2.87, + "learning_rate": 2.8340050078072275e-09, + "logits/chosen": -2.7178354263305664, + "logits/rejected": -2.728059768676758, + "logps/chosen": -180.62649536132812, + "logps/rejected": -178.77957153320312, + "loss": 0.9101, + "rewards/accuracies": 0.628125011920929, + "rewards/chosen": -0.1962624341249466, + "rewards/margins": 0.09259771555662155, + "rewards/rejected": -0.28886014223098755, + "step": 9840 + }, + { + "epoch": 2.87, + "learning_rate": 2.708013090906236e-09, + "logits/chosen": -2.7312958240509033, + "logits/rejected": -2.723904609680176, + "logps/chosen": -200.50428771972656, + "logps/rejected": -188.86557006835938, + "loss": 0.9225, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.2286476194858551, + "rewards/margins": 0.06230293586850166, + "rewards/rejected": -0.2909505367279053, + "step": 9850 + }, + { + "epoch": 2.88, + "learning_rate": 2.5848705943851966e-09, + "logits/chosen": -2.7521772384643555, + "logits/rejected": -2.7468342781066895, + "logps/chosen": -208.4209747314453, + "logps/rejected": -195.09420776367188, + "loss": 0.8946, + "rewards/accuracies": 0.559374988079071, + "rewards/chosen": -0.20935706794261932, + "rewards/margins": 0.11841585487127304, + "rewards/rejected": -0.32777291536331177, + "step": 9860 + }, + { + "epoch": 2.88, + "learning_rate": 2.464578937152767e-09, + "logits/chosen": -2.707209825515747, + "logits/rejected": -2.711197853088379, + "logps/chosen": -182.0824737548828, + "logps/rejected": -170.37973022460938, + "loss": 0.9154, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.2167019098997116, + "rewards/margins": 0.10509312152862549, + "rewards/rejected": -0.3217950463294983, + "step": 9870 + }, + { + "epoch": 2.88, + "learning_rate": 2.347139505268769e-09, + "logits/chosen": -2.7392303943634033, + "logits/rejected": -2.7479453086853027, + "logps/chosen": -175.31564331054688, + "logps/rejected": -187.35633850097656, + "loss": 0.9075, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.197030708193779, + "rewards/margins": 0.1033674031496048, + "rewards/rejected": -0.3003981113433838, + "step": 9880 + }, + { + "epoch": 2.88, + "learning_rate": 2.2325536519283983e-09, + "logits/chosen": -2.7220139503479004, + "logits/rejected": -2.730076789855957, + "logps/chosen": -203.31546020507812, + "logps/rejected": -197.3459014892578, + "loss": 0.8875, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.207614466547966, + "rewards/margins": 0.10640215873718262, + "rewards/rejected": -0.3140166401863098, + "step": 9890 + }, + { + "epoch": 2.89, + "learning_rate": 2.120822697446345e-09, + "logits/chosen": -2.755768299102783, + "logits/rejected": -2.7644848823547363, + "logps/chosen": -191.22451782226562, + "logps/rejected": -191.1256866455078, + "loss": 0.9287, + "rewards/accuracies": 0.590624988079071, + "rewards/chosen": -0.21304388344287872, + "rewards/margins": 0.08656670153141022, + "rewards/rejected": -0.29961055517196655, + "step": 9900 + }, + { + "epoch": 2.89, + "eval_logits/chosen": -2.6493735313415527, + "eval_logits/rejected": -2.643993616104126, + "eval_logps/chosen": -197.75450134277344, + "eval_logps/rejected": -183.87619018554688, + "eval_loss": 0.9304879307746887, + "eval_rewards/accuracies": 0.5832766890525818, + "eval_rewards/chosen": -0.21492087841033936, + "eval_rewards/margins": 0.0805683583021164, + "eval_rewards/rejected": -0.29548925161361694, + "eval_runtime": 443.2716, + "eval_samples_per_second": 26.541, + "eval_steps_per_second": 3.319, + "step": 9900 + }, + { + "epoch": 2.89, + "learning_rate": 2.0119479292419472e-09, + "logits/chosen": -2.707951784133911, + "logits/rejected": -2.7130141258239746, + "logps/chosen": -199.61636352539062, + "logps/rejected": -174.99508666992188, + "loss": 0.9208, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.18702030181884766, + "rewards/margins": 0.08625749498605728, + "rewards/rejected": -0.27327775955200195, + "step": 9910 + }, + { + "epoch": 2.89, + "learning_rate": 1.905930601823952e-09, + "logits/chosen": -2.758725643157959, + "logits/rejected": -2.730515956878662, + "logps/chosen": -210.56680297851562, + "logps/rejected": -177.89271545410156, + "loss": 0.9393, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.19237715005874634, + "rewards/margins": 0.09237701445817947, + "rewards/rejected": -0.2847541868686676, + "step": 9920 + }, + { + "epoch": 2.9, + "learning_rate": 1.8027719367763871e-09, + "logits/chosen": -2.745802402496338, + "logits/rejected": -2.7457292079925537, + "logps/chosen": -208.337646484375, + "logps/rejected": -192.1747283935547, + "loss": 0.9263, + "rewards/accuracies": 0.5531250238418579, + "rewards/chosen": -0.23373830318450928, + "rewards/margins": 0.0824032798409462, + "rewards/rejected": -0.3161415457725525, + "step": 9930 + }, + { + "epoch": 2.9, + "learning_rate": 1.7024731227443523e-09, + "logits/chosen": -2.714531183242798, + "logits/rejected": -2.7135071754455566, + "logps/chosen": -175.10997009277344, + "logps/rejected": -184.83316040039062, + "loss": 0.9294, + "rewards/accuracies": 0.5406249761581421, + "rewards/chosen": -0.20745953917503357, + "rewards/margins": 0.07404839992523193, + "rewards/rejected": -0.2815079391002655, + "step": 9940 + }, + { + "epoch": 2.9, + "learning_rate": 1.6050353154202778e-09, + "logits/chosen": -2.759021043777466, + "logits/rejected": -2.7469115257263184, + "logps/chosen": -211.49734497070312, + "logps/rejected": -189.22146606445312, + "loss": 0.9401, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.2346908301115036, + "rewards/margins": 0.057391781359910965, + "rewards/rejected": -0.29208260774612427, + "step": 9950 + }, + { + "epoch": 2.91, + "learning_rate": 1.5104596375307143e-09, + "logits/chosen": -2.7347006797790527, + "logits/rejected": -2.7453768253326416, + "logps/chosen": -200.760009765625, + "logps/rejected": -187.3798828125, + "loss": 0.9056, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.22261445224285126, + "rewards/margins": 0.08408321440219879, + "rewards/rejected": -0.30669766664505005, + "step": 9960 + }, + { + "epoch": 2.91, + "learning_rate": 1.4187471788232873e-09, + "logits/chosen": -2.7443184852600098, + "logits/rejected": -2.7420456409454346, + "logps/chosen": -195.6764373779297, + "logps/rejected": -179.39016723632812, + "loss": 0.9024, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.20467519760131836, + "rewards/margins": 0.10200591385364532, + "rewards/rejected": -0.3066811263561249, + "step": 9970 + }, + { + "epoch": 2.91, + "learning_rate": 1.329898996054235e-09, + "logits/chosen": -2.72369122505188, + "logits/rejected": -2.7290115356445312, + "logps/chosen": -206.4029998779297, + "logps/rejected": -194.41281127929688, + "loss": 0.9374, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.20877929031848907, + "rewards/margins": 0.09340817481279373, + "rewards/rejected": -0.302187442779541, + "step": 9980 + }, + { + "epoch": 2.91, + "learning_rate": 1.2439161129762232e-09, + "logits/chosen": -2.751157760620117, + "logits/rejected": -2.7274537086486816, + "logps/chosen": -197.70578002929688, + "logps/rejected": -176.07757568359375, + "loss": 0.9335, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.22376060485839844, + "rewards/margins": 0.0929488092660904, + "rewards/rejected": -0.31670939922332764, + "step": 9990 + }, + { + "epoch": 2.92, + "learning_rate": 1.1607995203264943e-09, + "logits/chosen": -2.704770088195801, + "logits/rejected": -2.7196052074432373, + "logps/chosen": -191.6173095703125, + "logps/rejected": -188.82199096679688, + "loss": 0.9296, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.20596985518932343, + "rewards/margins": 0.07100576907396317, + "rewards/rejected": -0.2769756019115448, + "step": 10000 + }, + { + "epoch": 2.92, + "eval_logits/chosen": -2.64931583404541, + "eval_logits/rejected": -2.6439385414123535, + "eval_logps/chosen": -197.76206970214844, + "eval_logps/rejected": -183.87411499023438, + "eval_loss": 0.9310234189033508, + "eval_rewards/accuracies": 0.5795377492904663, + "eval_rewards/chosen": -0.215674489736557, + "eval_rewards/margins": 0.07960996776819229, + "eval_rewards/rejected": -0.2952844798564911, + "eval_runtime": 443.2852, + "eval_samples_per_second": 26.54, + "eval_steps_per_second": 3.318, + "step": 10000 + }, + { + "epoch": 2.92, + "learning_rate": 1.0805501758154311e-09, + "logits/chosen": -2.756335496902466, + "logits/rejected": -2.7353618144989014, + "logps/chosen": -209.03677368164062, + "logps/rejected": -181.5239715576172, + "loss": 0.9188, + "rewards/accuracies": 0.6031249761581421, + "rewards/chosen": -0.2341901808977127, + "rewards/margins": 0.07234900444746017, + "rewards/rejected": -0.3065391480922699, + "step": 10010 + }, + { + "epoch": 2.92, + "learning_rate": 1.003169004115595e-09, + "logits/chosen": -2.767810344696045, + "logits/rejected": -2.7451963424682617, + "logps/chosen": -209.06796264648438, + "logps/rejected": -184.99880981445312, + "loss": 0.8944, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.20936338603496552, + "rewards/margins": 0.13027939200401306, + "rewards/rejected": -0.33964279294013977, + "step": 10020 + }, + { + "epoch": 2.93, + "learning_rate": 9.28656896851121e-10, + "logits/chosen": -2.74072003364563, + "logits/rejected": -2.7295937538146973, + "logps/chosen": -202.04502868652344, + "logps/rejected": -175.9278106689453, + "loss": 0.9223, + "rewards/accuracies": 0.5406249761581421, + "rewards/chosen": -0.22666554152965546, + "rewards/margins": 0.08890683203935623, + "rewards/rejected": -0.3155723512172699, + "step": 10030 + }, + { + "epoch": 2.93, + "learning_rate": 8.570147125872284e-10, + "logits/chosen": -2.743962049484253, + "logits/rejected": -2.744354248046875, + "logps/chosen": -185.90194702148438, + "logps/rejected": -182.25119018554688, + "loss": 0.9162, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.22167591750621796, + "rewards/margins": 0.09675368666648865, + "rewards/rejected": -0.3184296488761902, + "step": 10040 + }, + { + "epoch": 2.93, + "learning_rate": 7.88243276820616e-10, + "logits/chosen": -2.752854108810425, + "logits/rejected": -2.7184722423553467, + "logps/chosen": -198.3429412841797, + "logps/rejected": -167.09361267089844, + "loss": 0.9386, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.24509263038635254, + "rewards/margins": 0.05374573543667793, + "rewards/rejected": -0.2988383173942566, + "step": 10050 + }, + { + "epoch": 2.93, + "learning_rate": 7.223433819696645e-10, + "logits/chosen": -2.701383590698242, + "logits/rejected": -2.7218756675720215, + "logps/chosen": -175.10391235351562, + "logps/rejected": -178.0584259033203, + "loss": 0.9282, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.23060958087444305, + "rewards/margins": 0.07636566460132599, + "rewards/rejected": -0.30697527527809143, + "step": 10060 + }, + { + "epoch": 2.94, + "learning_rate": 6.593157873654998e-10, + "logits/chosen": -2.7408900260925293, + "logits/rejected": -2.7404873371124268, + "logps/chosen": -189.15286254882812, + "logps/rejected": -178.94862365722656, + "loss": 0.9359, + "rewards/accuracies": 0.559374988079071, + "rewards/chosen": -0.2220519483089447, + "rewards/margins": 0.058012705296278, + "rewards/rejected": -0.2800647020339966, + "step": 10070 + }, + { + "epoch": 2.94, + "learning_rate": 5.991612192432216e-10, + "logits/chosen": -2.7326502799987793, + "logits/rejected": -2.7282063961029053, + "logps/chosen": -215.2332763671875, + "logps/rejected": -203.8656463623047, + "loss": 0.9242, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.22559790313243866, + "rewards/margins": 0.08096420019865036, + "rewards/rejected": -0.3065621256828308, + "step": 10080 + }, + { + "epoch": 2.94, + "learning_rate": 5.418803707334385e-10, + "logits/chosen": -2.7055397033691406, + "logits/rejected": -2.7279880046844482, + "logps/chosen": -189.23861694335938, + "logps/rejected": -194.57003784179688, + "loss": 0.9041, + "rewards/accuracies": 0.5718749761581421, + "rewards/chosen": -0.21456527709960938, + "rewards/margins": 0.09282436221837997, + "rewards/rejected": -0.30738964676856995, + "step": 10090 + }, + { + "epoch": 2.95, + "learning_rate": 4.874739018544128e-10, + "logits/chosen": -2.7319326400756836, + "logits/rejected": -2.7255971431732178, + "logps/chosen": -198.3050079345703, + "logps/rejected": -183.6264190673828, + "loss": 0.9335, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.2186700403690338, + "rewards/margins": 0.09482128173112869, + "rewards/rejected": -0.3134912848472595, + "step": 10100 + }, + { + "epoch": 2.95, + "eval_logits/chosen": -2.649319648742676, + "eval_logits/rejected": -2.643939733505249, + "eval_logps/chosen": -197.7578125, + "eval_logps/rejected": -183.87391662597656, + "eval_loss": 0.9310972690582275, + "eval_rewards/accuracies": 0.581237256526947, + "eval_rewards/chosen": -0.21525147557258606, + "eval_rewards/margins": 0.08001116663217545, + "eval_rewards/rejected": -0.2952626645565033, + "eval_runtime": 443.3564, + "eval_samples_per_second": 26.536, + "eval_steps_per_second": 3.318, + "step": 10100 + }, + { + "epoch": 2.95, + "learning_rate": 4.3594243950428876e-10, + "logits/chosen": -2.7047019004821777, + "logits/rejected": -2.7111079692840576, + "logps/chosen": -176.3616943359375, + "logps/rejected": -184.66732788085938, + "loss": 0.9232, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.21429534256458282, + "rewards/margins": 0.07300708442926407, + "rewards/rejected": -0.2873024344444275, + "step": 10110 + }, + { + "epoch": 2.95, + "learning_rate": 3.8728657745407123e-10, + "logits/chosen": -2.7484915256500244, + "logits/rejected": -2.733851194381714, + "logps/chosen": -202.58200073242188, + "logps/rejected": -190.02320861816406, + "loss": 0.9252, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.2536359429359436, + "rewards/margins": 0.07397626340389252, + "rewards/rejected": -0.3276122212409973, + "step": 10120 + }, + { + "epoch": 2.95, + "learning_rate": 3.4150687634057484e-10, + "logits/chosen": -2.721923589706421, + "logits/rejected": -2.744096040725708, + "logps/chosen": -189.35020446777344, + "logps/rejected": -189.91268920898438, + "loss": 0.9022, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.20822253823280334, + "rewards/margins": 0.09155464917421341, + "rewards/rejected": -0.29977720975875854, + "step": 10130 + }, + { + "epoch": 2.96, + "learning_rate": 2.986038636601795e-10, + "logits/chosen": -2.7409708499908447, + "logits/rejected": -2.7214558124542236, + "logps/chosen": -202.751953125, + "logps/rejected": -177.0704803466797, + "loss": 0.93, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.23496882617473602, + "rewards/margins": 0.04633781313896179, + "rewards/rejected": -0.281306654214859, + "step": 10140 + }, + { + "epoch": 2.96, + "learning_rate": 2.585780337625576e-10, + "logits/chosen": -2.738527774810791, + "logits/rejected": -2.751260757446289, + "logps/chosen": -183.45188903808594, + "logps/rejected": -166.71090698242188, + "loss": 0.9162, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.189891055226326, + "rewards/margins": 0.09421588480472565, + "rewards/rejected": -0.28410694003105164, + "step": 10150 + }, + { + "epoch": 2.96, + "learning_rate": 2.2142984784506713e-10, + "logits/chosen": -2.7730815410614014, + "logits/rejected": -2.747037887573242, + "logps/chosen": -215.1787109375, + "logps/rejected": -189.70123291015625, + "loss": 0.9302, + "rewards/accuracies": 0.5531250238418579, + "rewards/chosen": -0.23184502124786377, + "rewards/margins": 0.081792913377285, + "rewards/rejected": -0.313637912273407, + "step": 10160 + }, + { + "epoch": 2.97, + "learning_rate": 1.8715973394745065e-10, + "logits/chosen": -2.740107774734497, + "logits/rejected": -2.7407820224761963, + "logps/chosen": -201.0320587158203, + "logps/rejected": -187.3585968017578, + "loss": 0.9204, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.19122305512428284, + "rewards/margins": 0.08361539244651794, + "rewards/rejected": -0.2748384475708008, + "step": 10170 + }, + { + "epoch": 2.97, + "learning_rate": 1.557680869468947e-10, + "logits/chosen": -2.7282848358154297, + "logits/rejected": -2.7195122241973877, + "logps/chosen": -199.21316528320312, + "logps/rejected": -178.7677001953125, + "loss": 0.9343, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.2206597775220871, + "rewards/margins": 0.0789443626999855, + "rewards/rejected": -0.2996041476726532, + "step": 10180 + }, + { + "epoch": 2.97, + "learning_rate": 1.2725526855347778e-10, + "logits/chosen": -2.7190909385681152, + "logits/rejected": -2.721194267272949, + "logps/chosen": -170.80845642089844, + "logps/rejected": -161.698486328125, + "loss": 0.9148, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.2027992457151413, + "rewards/margins": 0.08219794183969498, + "rewards/rejected": -0.28499719500541687, + "step": 10190 + }, + { + "epoch": 2.98, + "learning_rate": 1.0162160730592395e-10, + "logits/chosen": -2.729407787322998, + "logits/rejected": -2.72074294090271, + "logps/chosen": -211.5586700439453, + "logps/rejected": -196.6857147216797, + "loss": 0.9321, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.21750383079051971, + "rewards/margins": 0.06289499253034592, + "rewards/rejected": -0.28039878606796265, + "step": 10200 + }, + { + "epoch": 2.98, + "eval_logits/chosen": -2.6493217945098877, + "eval_logits/rejected": -2.64394211769104, + "eval_logps/chosen": -197.75448608398438, + "eval_logps/rejected": -183.87586975097656, + "eval_loss": 0.9304989576339722, + "eval_rewards/accuracies": 0.582426905632019, + "eval_rewards/chosen": -0.21491758525371552, + "eval_rewards/margins": 0.08054331690073013, + "eval_rewards/rejected": -0.29546090960502625, + "eval_runtime": 443.4554, + "eval_samples_per_second": 26.53, + "eval_steps_per_second": 3.317, + "step": 10200 + }, + { + "epoch": 2.98, + "learning_rate": 7.886739856796664e-11, + "logits/chosen": -2.7204930782318115, + "logits/rejected": -2.7524588108062744, + "logps/chosen": -195.971923828125, + "logps/rejected": -206.78628540039062, + "loss": 0.9028, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.2521055042743683, + "rewards/margins": 0.09555664658546448, + "rewards/rejected": -0.347662091255188, + "step": 10210 + }, + { + "epoch": 2.98, + "learning_rate": 5.899290452485162e-11, + "logits/chosen": -2.7260775566101074, + "logits/rejected": -2.7510170936584473, + "logps/chosen": -175.15951538085938, + "logps/rejected": -181.86631774902344, + "loss": 0.9093, + "rewards/accuracies": 0.628125011920929, + "rewards/chosen": -0.20711970329284668, + "rewards/margins": 0.1187393069267273, + "rewards/rejected": -0.325859010219574, + "step": 10220 + }, + { + "epoch": 2.98, + "learning_rate": 4.199835418025599e-11, + "logits/chosen": -2.714921474456787, + "logits/rejected": -2.738410472869873, + "logps/chosen": -189.37632751464844, + "logps/rejected": -192.016845703125, + "loss": 0.9309, + "rewards/accuracies": 0.590624988079071, + "rewards/chosen": -0.23209278285503387, + "rewards/margins": 0.10059485584497452, + "rewards/rejected": -0.332687646150589, + "step": 10230 + }, + { + "epoch": 2.99, + "learning_rate": 2.7883943353845807e-11, + "logits/chosen": -2.7257180213928223, + "logits/rejected": -2.709995746612549, + "logps/chosen": -182.7290802001953, + "logps/rejected": -177.0494384765625, + "loss": 0.9311, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.25530582666397095, + "rewards/margins": 0.062161706387996674, + "rewards/rejected": -0.31746751070022583, + "step": 10240 + }, + { + "epoch": 2.99, + "learning_rate": 1.6649834678778006e-11, + "logits/chosen": -2.72110915184021, + "logits/rejected": -2.7213776111602783, + "logps/chosen": -215.32852172851562, + "logps/rejected": -185.47840881347656, + "loss": 0.9248, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.22247782349586487, + "rewards/margins": 0.09786356985569, + "rewards/rejected": -0.3203413486480713, + "step": 10250 + }, + { + "epoch": 2.99, + "learning_rate": 8.296157600035103e-12, + "logits/chosen": -2.7592031955718994, + "logits/rejected": -2.7389073371887207, + "logps/chosen": -226.4311981201172, + "logps/rejected": -197.40982055664062, + "loss": 0.9225, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.1961437165737152, + "rewards/margins": 0.10170602798461914, + "rewards/rejected": -0.29784974455833435, + "step": 10260 + }, + { + "epoch": 3.0, + "learning_rate": 2.8230083727875944e-12, + "logits/chosen": -2.728482246398926, + "logits/rejected": -2.7387022972106934, + "logps/chosen": -210.0655059814453, + "logps/rejected": -200.79238891601562, + "loss": 0.9069, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.2141614854335785, + "rewards/margins": 0.12801960110664368, + "rewards/rejected": -0.34218111634254456, + "step": 10270 + }, + { + "epoch": 3.0, + "learning_rate": 2.304500613947713e-13, + "logits/chosen": -2.7390356063842773, + "logits/rejected": -2.744161605834961, + "logps/chosen": -211.9443817138672, + "logps/rejected": -208.78115844726562, + "loss": 0.9324, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.19223544001579285, + "rewards/margins": 0.08548939228057861, + "rewards/rejected": -0.27772483229637146, + "step": 10280 + }, + { + "epoch": 3.0, + "step": 10284, + "total_flos": 0.0, + "train_loss": 0.9410081987063832, + "train_runtime": 92137.4772, + "train_samples_per_second": 7.144, + "train_steps_per_second": 0.112 + } + ], + "logging_steps": 10, + "max_steps": 10284, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 100, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}