diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,9104 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 100, + "global_step": 5811, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 8.591065292096219e-10, + "logits/chosen": -2.5129990577697754, + "logits/rejected": -2.4275057315826416, + "logps/chosen": -96.6673583984375, + "logps/rejected": -105.15755462646484, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.01, + "learning_rate": 8.59106529209622e-09, + "logits/chosen": -2.988718271255493, + "logits/rejected": -2.9780874252319336, + "logps/chosen": -302.4128723144531, + "logps/rejected": -225.56951904296875, + "loss": 0.6947, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.007120599504560232, + "rewards/margins": -0.004252635408192873, + "rewards/rejected": -0.002867964096367359, + "step": 10 + }, + { + "epoch": 0.01, + "learning_rate": 1.718213058419244e-08, + "logits/chosen": -2.8921194076538086, + "logits/rejected": -2.7121551036834717, + "logps/chosen": -287.7423400878906, + "logps/rejected": -217.6292724609375, + "loss": 0.6906, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.0037767409812659025, + "rewards/margins": 0.010759315453469753, + "rewards/rejected": -0.006982574705034494, + "step": 20 + }, + { + "epoch": 0.02, + "learning_rate": 2.5773195876288656e-08, + "logits/chosen": -3.015655994415283, + "logits/rejected": -2.9962334632873535, + "logps/chosen": -297.9928283691406, + "logps/rejected": -203.88180541992188, + "loss": 0.6891, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.025857295840978622, + "rewards/margins": 0.025261688977479935, + "rewards/rejected": 0.0005956076784059405, + "step": 30 + }, + { + "epoch": 0.02, + "learning_rate": 3.436426116838488e-08, + "logits/chosen": -2.8478853702545166, + "logits/rejected": -2.9214625358581543, + "logps/chosen": -267.7845153808594, + "logps/rejected": -250.1910400390625, + "loss": 0.6861, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.027253543958067894, + "rewards/margins": 0.02428315207362175, + "rewards/rejected": 0.002970390487462282, + "step": 40 + }, + { + "epoch": 0.03, + "learning_rate": 4.29553264604811e-08, + "logits/chosen": -3.0094895362854004, + "logits/rejected": -2.9605789184570312, + "logps/chosen": -322.73681640625, + "logps/rejected": -245.77450561523438, + "loss": 0.6839, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.022152891382575035, + "rewards/margins": 0.04998321458697319, + "rewards/rejected": -0.027830326929688454, + "step": 50 + }, + { + "epoch": 0.03, + "learning_rate": 5.154639175257731e-08, + "logits/chosen": -2.9751992225646973, + "logits/rejected": -2.9595389366149902, + "logps/chosen": -308.54351806640625, + "logps/rejected": -224.53707885742188, + "loss": 0.6688, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.04110954329371452, + "rewards/margins": 0.11852701753377914, + "rewards/rejected": -0.07741747796535492, + "step": 60 + }, + { + "epoch": 0.04, + "learning_rate": 6.013745704467354e-08, + "logits/chosen": -2.9823076725006104, + "logits/rejected": -3.0206565856933594, + "logps/chosen": -375.16925048828125, + "logps/rejected": -224.032958984375, + "loss": 0.6642, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.07654228806495667, + "rewards/margins": 0.1424637734889984, + "rewards/rejected": -0.06592147052288055, + "step": 70 + }, + { + "epoch": 0.04, + "learning_rate": 6.872852233676976e-08, + "logits/chosen": -3.0697617530822754, + "logits/rejected": -3.036527156829834, + "logps/chosen": -353.6755676269531, + "logps/rejected": -216.9717559814453, + "loss": 0.6418, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.04413590952754021, + "rewards/margins": 0.10847017914056778, + "rewards/rejected": -0.06433425843715668, + "step": 80 + }, + { + "epoch": 0.05, + "learning_rate": 7.731958762886598e-08, + "logits/chosen": -2.9876997470855713, + "logits/rejected": -2.9616377353668213, + "logps/chosen": -291.57012939453125, + "logps/rejected": -193.0994873046875, + "loss": 0.6367, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.08899353444576263, + "rewards/margins": 0.24147820472717285, + "rewards/rejected": -0.1524846851825714, + "step": 90 + }, + { + "epoch": 0.05, + "learning_rate": 8.59106529209622e-08, + "logits/chosen": -2.8401777744293213, + "logits/rejected": -2.7715401649475098, + "logps/chosen": -261.7100524902344, + "logps/rejected": -255.4248046875, + "loss": 0.6284, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.03916650265455246, + "rewards/margins": 0.1420799195766449, + "rewards/rejected": -0.10291342437267303, + "step": 100 + }, + { + "epoch": 0.05, + "eval_logits/chosen": -2.823406934738159, + "eval_logits/rejected": -2.797581672668457, + "eval_logps/chosen": -253.8098907470703, + "eval_logps/rejected": -258.8415832519531, + "eval_loss": 0.6098471879959106, + "eval_rewards/accuracies": 0.734375, + "eval_rewards/chosen": 0.04252301901578903, + "eval_rewards/margins": 0.22968964278697968, + "eval_rewards/rejected": -0.18716664612293243, + "eval_runtime": 58.4622, + "eval_samples_per_second": 17.105, + "eval_steps_per_second": 0.274, + "step": 100 + }, + { + "epoch": 0.06, + "learning_rate": 9.450171821305841e-08, + "logits/chosen": -3.0175564289093018, + "logits/rejected": -3.084195137023926, + "logps/chosen": -344.5015869140625, + "logps/rejected": -294.0466613769531, + "loss": 0.6208, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.07540851831436157, + "rewards/margins": 0.3283361792564392, + "rewards/rejected": -0.25292766094207764, + "step": 110 + }, + { + "epoch": 0.06, + "learning_rate": 1.0309278350515462e-07, + "logits/chosen": -2.831212043762207, + "logits/rejected": -2.7832601070404053, + "logps/chosen": -184.1349639892578, + "logps/rejected": -206.84634399414062, + "loss": 0.587, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.06816364824771881, + "rewards/margins": 0.0881614089012146, + "rewards/rejected": -0.1563250720500946, + "step": 120 + }, + { + "epoch": 0.07, + "learning_rate": 1.1168384879725086e-07, + "logits/chosen": -3.0061099529266357, + "logits/rejected": -2.8498525619506836, + "logps/chosen": -333.06072998046875, + "logps/rejected": -189.4818115234375, + "loss": 0.5832, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.19702570140361786, + "rewards/margins": 0.5247530341148376, + "rewards/rejected": -0.3277273178100586, + "step": 130 + }, + { + "epoch": 0.07, + "learning_rate": 1.202749140893471e-07, + "logits/chosen": -2.8609023094177246, + "logits/rejected": -2.77339243888855, + "logps/chosen": -297.0363464355469, + "logps/rejected": -242.37255859375, + "loss": 0.5649, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.016815107315778732, + "rewards/margins": 0.422064870595932, + "rewards/rejected": -0.40524977445602417, + "step": 140 + }, + { + "epoch": 0.08, + "learning_rate": 1.2886597938144328e-07, + "logits/chosen": -3.088327407836914, + "logits/rejected": -2.9465346336364746, + "logps/chosen": -305.6724548339844, + "logps/rejected": -314.7848205566406, + "loss": 0.5548, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.12263361364603043, + "rewards/margins": 0.6976320147514343, + "rewards/rejected": -0.5749984979629517, + "step": 150 + }, + { + "epoch": 0.08, + "learning_rate": 1.3745704467353952e-07, + "logits/chosen": -2.883831024169922, + "logits/rejected": -2.8376777172088623, + "logps/chosen": -267.89154052734375, + "logps/rejected": -199.8636474609375, + "loss": 0.5362, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.17647962272167206, + "rewards/margins": 0.5636450052261353, + "rewards/rejected": -0.387165367603302, + "step": 160 + }, + { + "epoch": 0.09, + "learning_rate": 1.4604810996563573e-07, + "logits/chosen": -2.823948383331299, + "logits/rejected": -2.7283661365509033, + "logps/chosen": -234.5882568359375, + "logps/rejected": -194.86480712890625, + "loss": 0.4987, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.09966392815113068, + "rewards/margins": 0.7896274328231812, + "rewards/rejected": -0.6899635791778564, + "step": 170 + }, + { + "epoch": 0.09, + "learning_rate": 1.5463917525773197e-07, + "logits/chosen": -2.9629111289978027, + "logits/rejected": -2.9428882598876953, + "logps/chosen": -232.97244262695312, + "logps/rejected": -183.2829132080078, + "loss": 0.5185, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.2645714282989502, + "rewards/margins": 0.8501029014587402, + "rewards/rejected": -0.5855314135551453, + "step": 180 + }, + { + "epoch": 0.1, + "learning_rate": 1.6323024054982818e-07, + "logits/chosen": -2.9642796516418457, + "logits/rejected": -2.97268009185791, + "logps/chosen": -275.6226501464844, + "logps/rejected": -233.35537719726562, + "loss": 0.5748, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1616288721561432, + "rewards/margins": 0.4936322569847107, + "rewards/rejected": -0.3320034146308899, + "step": 190 + }, + { + "epoch": 0.1, + "learning_rate": 1.718213058419244e-07, + "logits/chosen": -2.9080729484558105, + "logits/rejected": -2.9043314456939697, + "logps/chosen": -282.22369384765625, + "logps/rejected": -235.44992065429688, + "loss": 0.4908, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.13387183845043182, + "rewards/margins": 0.7095439434051514, + "rewards/rejected": -0.5756720900535583, + "step": 200 + }, + { + "epoch": 0.1, + "eval_logits/chosen": -2.7959609031677246, + "eval_logits/rejected": -2.7718665599823, + "eval_logps/chosen": -254.51446533203125, + "eval_logps/rejected": -263.8123779296875, + "eval_loss": 0.5425560474395752, + "eval_rewards/accuracies": 0.75, + "eval_rewards/chosen": -0.027933437377214432, + "eval_rewards/margins": 0.6563125252723694, + "eval_rewards/rejected": -0.6842460036277771, + "eval_runtime": 58.0136, + "eval_samples_per_second": 17.237, + "eval_steps_per_second": 0.276, + "step": 200 + }, + { + "epoch": 0.11, + "learning_rate": 1.804123711340206e-07, + "logits/chosen": -2.664795160293579, + "logits/rejected": -2.427393674850464, + "logps/chosen": -297.56488037109375, + "logps/rejected": -226.8320770263672, + "loss": 0.5682, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.22673270106315613, + "rewards/margins": 0.919518768787384, + "rewards/rejected": -1.1462514400482178, + "step": 210 + }, + { + "epoch": 0.11, + "learning_rate": 1.8900343642611682e-07, + "logits/chosen": -2.8164966106414795, + "logits/rejected": -2.7533140182495117, + "logps/chosen": -316.3358459472656, + "logps/rejected": -248.8792724609375, + "loss": 0.5418, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2400936633348465, + "rewards/margins": 0.5202454924583435, + "rewards/rejected": -0.7603391408920288, + "step": 220 + }, + { + "epoch": 0.12, + "learning_rate": 1.9759450171821303e-07, + "logits/chosen": -2.8455495834350586, + "logits/rejected": -2.815950870513916, + "logps/chosen": -291.536376953125, + "logps/rejected": -252.3511199951172, + "loss": 0.516, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.05435393005609512, + "rewards/margins": 0.677357017993927, + "rewards/rejected": -0.7317109107971191, + "step": 230 + }, + { + "epoch": 0.12, + "learning_rate": 2.0618556701030925e-07, + "logits/chosen": -2.884962558746338, + "logits/rejected": -2.9899585247039795, + "logps/chosen": -362.83612060546875, + "logps/rejected": -246.82815551757812, + "loss": 0.5416, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4428789019584656, + "rewards/margins": 0.2312956303358078, + "rewards/rejected": -0.6741746068000793, + "step": 240 + }, + { + "epoch": 0.13, + "learning_rate": 2.1477663230240549e-07, + "logits/chosen": -2.979492425918579, + "logits/rejected": -2.9899439811706543, + "logps/chosen": -232.15756225585938, + "logps/rejected": -157.3478240966797, + "loss": 0.5141, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.26762503385543823, + "rewards/margins": 1.1515061855316162, + "rewards/rejected": -0.8838812112808228, + "step": 250 + }, + { + "epoch": 0.13, + "learning_rate": 2.2336769759450173e-07, + "logits/chosen": -3.0052077770233154, + "logits/rejected": -2.9878716468811035, + "logps/chosen": -309.3619689941406, + "logps/rejected": -189.45968627929688, + "loss": 0.5321, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4304015040397644, + "rewards/margins": 1.0413486957550049, + "rewards/rejected": -0.6109471917152405, + "step": 260 + }, + { + "epoch": 0.14, + "learning_rate": 2.3195876288659794e-07, + "logits/chosen": -2.8794291019439697, + "logits/rejected": -2.831512928009033, + "logps/chosen": -306.6054992675781, + "logps/rejected": -237.39382934570312, + "loss": 0.4909, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.18433420360088348, + "rewards/margins": 0.8596351742744446, + "rewards/rejected": -0.6753008365631104, + "step": 270 + }, + { + "epoch": 0.14, + "learning_rate": 2.405498281786942e-07, + "logits/chosen": -2.9824016094207764, + "logits/rejected": -2.9367408752441406, + "logps/chosen": -350.133056640625, + "logps/rejected": -254.4954071044922, + "loss": 0.5336, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.20658831298351288, + "rewards/margins": 0.7473801374435425, + "rewards/rejected": -0.5407918691635132, + "step": 280 + }, + { + "epoch": 0.15, + "learning_rate": 2.4914089347079036e-07, + "logits/chosen": -2.7565178871154785, + "logits/rejected": -2.944960832595825, + "logps/chosen": -242.48397827148438, + "logps/rejected": -227.69107055664062, + "loss": 0.5056, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.699475109577179, + "rewards/margins": 1.3677313327789307, + "rewards/rejected": -0.6682561635971069, + "step": 290 + }, + { + "epoch": 0.15, + "learning_rate": 2.5773195876288655e-07, + "logits/chosen": -2.779125928878784, + "logits/rejected": -2.943162679672241, + "logps/chosen": -411.8221130371094, + "logps/rejected": -222.3397216796875, + "loss": 0.5264, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.34889036417007446, + "rewards/margins": 1.2339386940002441, + "rewards/rejected": -0.8850483894348145, + "step": 300 + }, + { + "epoch": 0.15, + "eval_logits/chosen": -2.812185525894165, + "eval_logits/rejected": -2.789177656173706, + "eval_logps/chosen": -253.82086181640625, + "eval_logps/rejected": -266.7626953125, + "eval_loss": 0.5323615670204163, + "eval_rewards/accuracies": 0.765625, + "eval_rewards/chosen": 0.04142449051141739, + "eval_rewards/margins": 1.0207018852233887, + "eval_rewards/rejected": -0.9792775511741638, + "eval_runtime": 59.6543, + "eval_samples_per_second": 16.763, + "eval_steps_per_second": 0.268, + "step": 300 + }, + { + "epoch": 0.16, + "learning_rate": 2.663230240549828e-07, + "logits/chosen": -2.9955785274505615, + "logits/rejected": -2.9795451164245605, + "logps/chosen": -318.2289123535156, + "logps/rejected": -216.7342071533203, + "loss": 0.5172, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.10595469176769257, + "rewards/margins": 1.0604875087738037, + "rewards/rejected": -0.9545329213142395, + "step": 310 + }, + { + "epoch": 0.17, + "learning_rate": 2.7491408934707903e-07, + "logits/chosen": -2.9341654777526855, + "logits/rejected": -2.9966204166412354, + "logps/chosen": -379.42572021484375, + "logps/rejected": -282.17291259765625, + "loss": 0.4682, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.08291205763816833, + "rewards/margins": 1.1605613231658936, + "rewards/rejected": -1.0776493549346924, + "step": 320 + }, + { + "epoch": 0.17, + "learning_rate": 2.835051546391752e-07, + "logits/chosen": -2.8474197387695312, + "logits/rejected": -2.844364643096924, + "logps/chosen": -331.90802001953125, + "logps/rejected": -223.3518524169922, + "loss": 0.4881, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.02964567206799984, + "rewards/margins": 1.4632409811019897, + "rewards/rejected": -1.4928867816925049, + "step": 330 + }, + { + "epoch": 0.18, + "learning_rate": 2.9209621993127146e-07, + "logits/chosen": -2.68745756149292, + "logits/rejected": -2.817155361175537, + "logps/chosen": -232.6031036376953, + "logps/rejected": -246.84768676757812, + "loss": 0.5196, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.5186244249343872, + "rewards/margins": 1.0158860683441162, + "rewards/rejected": -0.49726182222366333, + "step": 340 + }, + { + "epoch": 0.18, + "learning_rate": 3.006872852233677e-07, + "logits/chosen": -3.0304269790649414, + "logits/rejected": -2.9698691368103027, + "logps/chosen": -159.0189208984375, + "logps/rejected": -212.7183380126953, + "loss": 0.4873, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.310208797454834, + "rewards/margins": 0.5256294012069702, + "rewards/rejected": -0.8358383178710938, + "step": 350 + }, + { + "epoch": 0.19, + "learning_rate": 3.0927835051546394e-07, + "logits/chosen": -3.0391955375671387, + "logits/rejected": -3.0694854259490967, + "logps/chosen": -381.39715576171875, + "logps/rejected": -347.92559814453125, + "loss": 0.4607, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.43349432945251465, + "rewards/margins": 1.4752476215362549, + "rewards/rejected": -1.0417532920837402, + "step": 360 + }, + { + "epoch": 0.19, + "learning_rate": 3.178694158075601e-07, + "logits/chosen": -2.985565662384033, + "logits/rejected": -2.951699733734131, + "logps/chosen": -158.28598022460938, + "logps/rejected": -127.53106689453125, + "loss": 0.4825, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.025753701105713844, + "rewards/margins": 1.3051038980484009, + "rewards/rejected": -1.330857515335083, + "step": 370 + }, + { + "epoch": 0.2, + "learning_rate": 3.2646048109965636e-07, + "logits/chosen": -2.8883204460144043, + "logits/rejected": -2.7797765731811523, + "logps/chosen": -300.84283447265625, + "logps/rejected": -306.0265197753906, + "loss": 0.5606, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.03248428553342819, + "rewards/margins": 0.8653362393379211, + "rewards/rejected": -0.8328520655632019, + "step": 380 + }, + { + "epoch": 0.2, + "learning_rate": 3.3505154639175255e-07, + "logits/chosen": -2.8869693279266357, + "logits/rejected": -2.8558154106140137, + "logps/chosen": -295.94268798828125, + "logps/rejected": -245.67544555664062, + "loss": 0.4789, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0472743920981884, + "rewards/margins": 1.3127429485321045, + "rewards/rejected": -1.2654683589935303, + "step": 390 + }, + { + "epoch": 0.21, + "learning_rate": 3.436426116838488e-07, + "logits/chosen": -3.0791659355163574, + "logits/rejected": -3.0203123092651367, + "logps/chosen": -251.6421661376953, + "logps/rejected": -219.4331512451172, + "loss": 0.5536, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.3093084990978241, + "rewards/margins": 0.4000861644744873, + "rewards/rejected": -0.7093946933746338, + "step": 400 + }, + { + "epoch": 0.21, + "eval_logits/chosen": -2.8764305114746094, + "eval_logits/rejected": -2.8541693687438965, + "eval_logps/chosen": -254.42034912109375, + "eval_logps/rejected": -272.24603271484375, + "eval_loss": 0.4957379102706909, + "eval_rewards/accuracies": 0.796875, + "eval_rewards/chosen": -0.01852385140955448, + "eval_rewards/margins": 1.5090851783752441, + "eval_rewards/rejected": -1.5276089906692505, + "eval_runtime": 56.3835, + "eval_samples_per_second": 17.736, + "eval_steps_per_second": 0.284, + "step": 400 + }, + { + "epoch": 0.21, + "learning_rate": 3.5223367697594503e-07, + "logits/chosen": -2.8821568489074707, + "logits/rejected": -2.8333544731140137, + "logps/chosen": -320.3736267089844, + "logps/rejected": -205.11056518554688, + "loss": 0.4277, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.0024402737617492676, + "rewards/margins": 1.244533658027649, + "rewards/rejected": -1.2420933246612549, + "step": 410 + }, + { + "epoch": 0.22, + "learning_rate": 3.608247422680412e-07, + "logits/chosen": -2.9139723777770996, + "logits/rejected": -2.8578293323516846, + "logps/chosen": -301.2723693847656, + "logps/rejected": -248.91744995117188, + "loss": 0.5208, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.27292880415916443, + "rewards/margins": 0.9970871210098267, + "rewards/rejected": -1.2700159549713135, + "step": 420 + }, + { + "epoch": 0.22, + "learning_rate": 3.6941580756013745e-07, + "logits/chosen": -2.9251325130462646, + "logits/rejected": -2.8964738845825195, + "logps/chosen": -210.9687042236328, + "logps/rejected": -185.3360137939453, + "loss": 0.5229, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.43344053626060486, + "rewards/margins": 0.8561422228813171, + "rewards/rejected": -1.2895828485488892, + "step": 430 + }, + { + "epoch": 0.23, + "learning_rate": 3.7800687285223364e-07, + "logits/chosen": -2.901094436645508, + "logits/rejected": -2.8542165756225586, + "logps/chosen": -348.6666259765625, + "logps/rejected": -329.27294921875, + "loss": 0.6022, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.014251199550926685, + "rewards/margins": 0.753572940826416, + "rewards/rejected": -0.7393215298652649, + "step": 440 + }, + { + "epoch": 0.23, + "learning_rate": 3.865979381443299e-07, + "logits/chosen": -2.936382532119751, + "logits/rejected": -2.9940216541290283, + "logps/chosen": -308.2112731933594, + "logps/rejected": -232.1812744140625, + "loss": 0.504, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.3824332356452942, + "rewards/margins": 0.28821295499801636, + "rewards/rejected": -0.6706462502479553, + "step": 450 + }, + { + "epoch": 0.24, + "learning_rate": 3.9518900343642607e-07, + "logits/chosen": -2.951936960220337, + "logits/rejected": -3.0050208568573, + "logps/chosen": -326.07659912109375, + "logps/rejected": -301.6195983886719, + "loss": 0.5801, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1851659119129181, + "rewards/margins": 1.5346710681915283, + "rewards/rejected": -1.349505066871643, + "step": 460 + }, + { + "epoch": 0.24, + "learning_rate": 4.037800687285223e-07, + "logits/chosen": -2.8154656887054443, + "logits/rejected": -2.8765406608581543, + "logps/chosen": -320.0531311035156, + "logps/rejected": -226.99124145507812, + "loss": 0.4564, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.22784185409545898, + "rewards/margins": 0.7857998013496399, + "rewards/rejected": -1.013641595840454, + "step": 470 + }, + { + "epoch": 0.25, + "learning_rate": 4.123711340206185e-07, + "logits/chosen": -3.0402634143829346, + "logits/rejected": -3.021247625350952, + "logps/chosen": -284.4671936035156, + "logps/rejected": -287.5126647949219, + "loss": 0.4915, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.03487253934144974, + "rewards/margins": 0.45380640029907227, + "rewards/rejected": -0.4886789321899414, + "step": 480 + }, + { + "epoch": 0.25, + "learning_rate": 4.209621993127148e-07, + "logits/chosen": -2.9354074001312256, + "logits/rejected": -2.920379638671875, + "logps/chosen": -311.0786437988281, + "logps/rejected": -246.13339233398438, + "loss": 0.4588, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.48857221007347107, + "rewards/margins": 0.9489312171936035, + "rewards/rejected": -1.4375033378601074, + "step": 490 + }, + { + "epoch": 0.26, + "learning_rate": 4.2955326460481097e-07, + "logits/chosen": -3.123109817504883, + "logits/rejected": -3.0762407779693604, + "logps/chosen": -309.3453063964844, + "logps/rejected": -281.5166015625, + "loss": 0.5362, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.25006183981895447, + "rewards/margins": 1.4525038003921509, + "rewards/rejected": -1.202441930770874, + "step": 500 + }, + { + "epoch": 0.26, + "eval_logits/chosen": -2.8957557678222656, + "eval_logits/rejected": -2.87016224861145, + "eval_logps/chosen": -256.86529541015625, + "eval_logps/rejected": -272.88690185546875, + "eval_loss": 0.503109335899353, + "eval_rewards/accuracies": 0.78125, + "eval_rewards/chosen": -0.2630198001861572, + "eval_rewards/margins": 1.3286765813827515, + "eval_rewards/rejected": -1.5916962623596191, + "eval_runtime": 55.3853, + "eval_samples_per_second": 18.055, + "eval_steps_per_second": 0.289, + "step": 500 + }, + { + "epoch": 0.26, + "learning_rate": 4.381443298969072e-07, + "logits/chosen": -2.6036550998687744, + "logits/rejected": -2.6383635997772217, + "logps/chosen": -252.81375122070312, + "logps/rejected": -243.0044708251953, + "loss": 0.5633, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6361426115036011, + "rewards/margins": 0.3173540532588959, + "rewards/rejected": -0.9534965753555298, + "step": 510 + }, + { + "epoch": 0.27, + "learning_rate": 4.4673539518900345e-07, + "logits/chosen": -2.96708607673645, + "logits/rejected": -3.0084481239318848, + "logps/chosen": -186.54592895507812, + "logps/rejected": -187.34884643554688, + "loss": 0.5443, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.1140596866607666, + "rewards/margins": 1.242305874824524, + "rewards/rejected": -1.356365442276001, + "step": 520 + }, + { + "epoch": 0.27, + "learning_rate": 4.5532646048109964e-07, + "logits/chosen": -2.987997531890869, + "logits/rejected": -2.9607906341552734, + "logps/chosen": -292.63690185546875, + "logps/rejected": -240.1947479248047, + "loss": 0.5091, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.1345333755016327, + "rewards/margins": 1.3595573902130127, + "rewards/rejected": -1.2250239849090576, + "step": 530 + }, + { + "epoch": 0.28, + "learning_rate": 4.639175257731959e-07, + "logits/chosen": -2.8647749423980713, + "logits/rejected": -2.868330955505371, + "logps/chosen": -186.86167907714844, + "logps/rejected": -243.17910766601562, + "loss": 0.6201, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.7495313882827759, + "rewards/margins": 0.6188509464263916, + "rewards/rejected": -1.3683823347091675, + "step": 540 + }, + { + "epoch": 0.28, + "learning_rate": 4.7250859106529206e-07, + "logits/chosen": -2.9979634284973145, + "logits/rejected": -2.9638993740081787, + "logps/chosen": -349.7961730957031, + "logps/rejected": -288.20062255859375, + "loss": 0.6153, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.04092409461736679, + "rewards/margins": 1.4131947755813599, + "rewards/rejected": -1.4541189670562744, + "step": 550 + }, + { + "epoch": 0.29, + "learning_rate": 4.810996563573884e-07, + "logits/chosen": -3.0216901302337646, + "logits/rejected": -3.0451061725616455, + "logps/chosen": -326.0102844238281, + "logps/rejected": -307.83367919921875, + "loss": 0.539, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.6230143904685974, + "rewards/margins": 0.9213398098945618, + "rewards/rejected": -1.5443540811538696, + "step": 560 + }, + { + "epoch": 0.29, + "learning_rate": 4.896907216494845e-07, + "logits/chosen": -2.990562677383423, + "logits/rejected": -2.9301705360412598, + "logps/chosen": -336.96826171875, + "logps/rejected": -225.23599243164062, + "loss": 0.5141, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8188888430595398, + "rewards/margins": 1.2901289463043213, + "rewards/rejected": -2.109017848968506, + "step": 570 + }, + { + "epoch": 0.3, + "learning_rate": 4.982817869415807e-07, + "logits/chosen": -2.900038003921509, + "logits/rejected": -2.9860446453094482, + "logps/chosen": -322.78240966796875, + "logps/rejected": -231.99667358398438, + "loss": 0.5668, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.6513129472732544, + "rewards/margins": 0.49615031480789185, + "rewards/rejected": -1.147463321685791, + "step": 580 + }, + { + "epoch": 0.3, + "learning_rate": 4.992350353796136e-07, + "logits/chosen": -2.936190605163574, + "logits/rejected": -2.8864665031433105, + "logps/chosen": -251.67172241210938, + "logps/rejected": -241.36318969726562, + "loss": 0.4964, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.027965422719717026, + "rewards/margins": 1.3970218896865845, + "rewards/rejected": -1.3690563440322876, + "step": 590 + }, + { + "epoch": 0.31, + "learning_rate": 4.982788296041308e-07, + "logits/chosen": -2.9233040809631348, + "logits/rejected": -2.961263656616211, + "logps/chosen": -199.49600219726562, + "logps/rejected": -255.68612670898438, + "loss": 0.5966, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.13642588257789612, + "rewards/margins": 1.8362632989883423, + "rewards/rejected": -1.9726893901824951, + "step": 600 + }, + { + "epoch": 0.31, + "eval_logits/chosen": -2.8986048698425293, + "eval_logits/rejected": -2.8777544498443604, + "eval_logps/chosen": -257.2279357910156, + "eval_logps/rejected": -273.46136474609375, + "eval_loss": 0.5963188409805298, + "eval_rewards/accuracies": 0.78125, + "eval_rewards/chosen": -0.29928162693977356, + "eval_rewards/margins": 1.349860429763794, + "eval_rewards/rejected": -1.6491420269012451, + "eval_runtime": 54.6151, + "eval_samples_per_second": 18.31, + "eval_steps_per_second": 0.293, + "step": 600 + }, + { + "epoch": 0.31, + "learning_rate": 4.973226238286479e-07, + "logits/chosen": -2.9391376972198486, + "logits/rejected": -2.9422051906585693, + "logps/chosen": -302.2151184082031, + "logps/rejected": -287.15606689453125, + "loss": 0.5368, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4417892098426819, + "rewards/margins": 1.2578237056732178, + "rewards/rejected": -1.6996129751205444, + "step": 610 + }, + { + "epoch": 0.32, + "learning_rate": 4.96366418053165e-07, + "logits/chosen": -3.0650055408477783, + "logits/rejected": -3.0660297870635986, + "logps/chosen": -334.0442810058594, + "logps/rejected": -252.632080078125, + "loss": 0.5857, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.07565183192491531, + "rewards/margins": 0.7010248899459839, + "rewards/rejected": -0.6253730654716492, + "step": 620 + }, + { + "epoch": 0.33, + "learning_rate": 4.954102122776821e-07, + "logits/chosen": -2.918349027633667, + "logits/rejected": -2.8907716274261475, + "logps/chosen": -195.51907348632812, + "logps/rejected": -167.48745727539062, + "loss": 0.5561, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.20618407428264618, + "rewards/margins": 1.305176019668579, + "rewards/rejected": -1.5113601684570312, + "step": 630 + }, + { + "epoch": 0.33, + "learning_rate": 4.944540065021993e-07, + "logits/chosen": -2.8896799087524414, + "logits/rejected": -2.9980359077453613, + "logps/chosen": -264.43023681640625, + "logps/rejected": -230.43008422851562, + "loss": 0.5386, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.18526920676231384, + "rewards/margins": 1.8127784729003906, + "rewards/rejected": -1.9980475902557373, + "step": 640 + }, + { + "epoch": 0.34, + "learning_rate": 4.934978007267163e-07, + "logits/chosen": -2.959494113922119, + "logits/rejected": -2.982419967651367, + "logps/chosen": -242.4766082763672, + "logps/rejected": -274.0234680175781, + "loss": 0.5399, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2960631847381592, + "rewards/margins": 1.0227611064910889, + "rewards/rejected": -1.318824291229248, + "step": 650 + }, + { + "epoch": 0.34, + "learning_rate": 4.925415949512335e-07, + "logits/chosen": -2.9986279010772705, + "logits/rejected": -2.966939926147461, + "logps/chosen": -328.2708435058594, + "logps/rejected": -273.13006591796875, + "loss": 0.5061, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.47610530257225037, + "rewards/margins": 1.19678795337677, + "rewards/rejected": -1.6728931665420532, + "step": 660 + }, + { + "epoch": 0.35, + "learning_rate": 4.915853891757506e-07, + "logits/chosen": -2.902583360671997, + "logits/rejected": -2.941610336303711, + "logps/chosen": -197.52853393554688, + "logps/rejected": -190.82029724121094, + "loss": 0.6096, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.34461653232574463, + "rewards/margins": 0.8742098808288574, + "rewards/rejected": -1.2188262939453125, + "step": 670 + }, + { + "epoch": 0.35, + "learning_rate": 4.906291834002677e-07, + "logits/chosen": -2.8424625396728516, + "logits/rejected": -2.791315793991089, + "logps/chosen": -273.73455810546875, + "logps/rejected": -229.92031860351562, + "loss": 0.5023, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.6406866312026978, + "rewards/margins": 0.9865404367446899, + "rewards/rejected": -1.6272270679473877, + "step": 680 + }, + { + "epoch": 0.36, + "learning_rate": 4.896729776247848e-07, + "logits/chosen": -3.039944648742676, + "logits/rejected": -2.9114279747009277, + "logps/chosen": -344.2494201660156, + "logps/rejected": -179.3026580810547, + "loss": 0.4748, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.01536635123193264, + "rewards/margins": 1.3766069412231445, + "rewards/rejected": -1.3612406253814697, + "step": 690 + }, + { + "epoch": 0.36, + "learning_rate": 4.88716771849302e-07, + "logits/chosen": -2.9146389961242676, + "logits/rejected": -2.918255090713501, + "logps/chosen": -441.6365661621094, + "logps/rejected": -344.4063720703125, + "loss": 0.5014, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2016751766204834, + "rewards/margins": 1.255171775817871, + "rewards/rejected": -1.456847071647644, + "step": 700 + }, + { + "epoch": 0.36, + "eval_logits/chosen": -2.7868502140045166, + "eval_logits/rejected": -2.7659108638763428, + "eval_logps/chosen": -257.09423828125, + "eval_logps/rejected": -271.72039794921875, + "eval_loss": 0.5382026433944702, + "eval_rewards/accuracies": 0.75, + "eval_rewards/chosen": -0.28591296076774597, + "eval_rewards/margins": 1.189131736755371, + "eval_rewards/rejected": -1.4750447273254395, + "eval_runtime": 57.4875, + "eval_samples_per_second": 17.395, + "eval_steps_per_second": 0.278, + "step": 700 + }, + { + "epoch": 0.37, + "learning_rate": 4.87760566073819e-07, + "logits/chosen": -2.8412322998046875, + "logits/rejected": -2.9222323894500732, + "logps/chosen": -265.5148010253906, + "logps/rejected": -250.9593963623047, + "loss": 0.5254, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7364897131919861, + "rewards/margins": 1.0006908178329468, + "rewards/rejected": -1.7371807098388672, + "step": 710 + }, + { + "epoch": 0.37, + "learning_rate": 4.868043602983362e-07, + "logits/chosen": -2.9580254554748535, + "logits/rejected": -2.9545352458953857, + "logps/chosen": -275.90625, + "logps/rejected": -375.8464660644531, + "loss": 0.5059, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.17938612401485443, + "rewards/margins": 2.483105421066284, + "rewards/rejected": -2.662491798400879, + "step": 720 + }, + { + "epoch": 0.38, + "learning_rate": 4.858481545228533e-07, + "logits/chosen": -2.9665563106536865, + "logits/rejected": -2.962049722671509, + "logps/chosen": -315.9619140625, + "logps/rejected": -332.67608642578125, + "loss": 0.5005, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3134005665779114, + "rewards/margins": 1.136232614517212, + "rewards/rejected": -1.4496333599090576, + "step": 730 + }, + { + "epoch": 0.38, + "learning_rate": 4.848919487473704e-07, + "logits/chosen": -2.9164295196533203, + "logits/rejected": -2.856682538986206, + "logps/chosen": -322.0476379394531, + "logps/rejected": -230.6309356689453, + "loss": 0.5886, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.5171477198600769, + "rewards/margins": 1.454939603805542, + "rewards/rejected": -1.9720872640609741, + "step": 740 + }, + { + "epoch": 0.39, + "learning_rate": 4.839357429718875e-07, + "logits/chosen": -2.884716510772705, + "logits/rejected": -2.9519991874694824, + "logps/chosen": -297.78839111328125, + "logps/rejected": -239.98959350585938, + "loss": 0.5356, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.3978341817855835, + "rewards/margins": 1.704395055770874, + "rewards/rejected": -2.102229356765747, + "step": 750 + }, + { + "epoch": 0.39, + "learning_rate": 4.829795371964047e-07, + "logits/chosen": -2.831848621368408, + "logits/rejected": -2.817645311355591, + "logps/chosen": -249.10452270507812, + "logps/rejected": -241.6534881591797, + "loss": 0.8766, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.2761456370353699, + "rewards/margins": 1.4581564664840698, + "rewards/rejected": -1.7343019247055054, + "step": 760 + }, + { + "epoch": 0.4, + "learning_rate": 4.820233314209217e-07, + "logits/chosen": -2.9512200355529785, + "logits/rejected": -2.9007842540740967, + "logps/chosen": -230.91299438476562, + "logps/rejected": -257.34375, + "loss": 0.5829, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4074572026729584, + "rewards/margins": 1.2204868793487549, + "rewards/rejected": -1.627943992614746, + "step": 770 + }, + { + "epoch": 0.4, + "learning_rate": 4.810671256454389e-07, + "logits/chosen": -2.7580349445343018, + "logits/rejected": -2.8238117694854736, + "logps/chosen": -309.89202880859375, + "logps/rejected": -203.035400390625, + "loss": 0.4978, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.5273137092590332, + "rewards/margins": 1.8291202783584595, + "rewards/rejected": -2.3564341068267822, + "step": 780 + }, + { + "epoch": 0.41, + "learning_rate": 4.80110919869956e-07, + "logits/chosen": -2.884530544281006, + "logits/rejected": -2.9204657077789307, + "logps/chosen": -300.7388916015625, + "logps/rejected": -258.79180908203125, + "loss": 0.5672, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.8420109748840332, + "rewards/margins": 1.2984743118286133, + "rewards/rejected": -2.1404852867126465, + "step": 790 + }, + { + "epoch": 0.41, + "learning_rate": 4.791547140944731e-07, + "logits/chosen": -2.904214859008789, + "logits/rejected": -2.9331746101379395, + "logps/chosen": -254.6400146484375, + "logps/rejected": -260.09088134765625, + "loss": 0.5334, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.006157719995826483, + "rewards/margins": 1.616097092628479, + "rewards/rejected": -1.6099392175674438, + "step": 800 + }, + { + "epoch": 0.41, + "eval_logits/chosen": -2.7265069484710693, + "eval_logits/rejected": -2.705258846282959, + "eval_logps/chosen": -258.5242004394531, + "eval_logps/rejected": -275.9377746582031, + "eval_loss": 0.5677424669265747, + "eval_rewards/accuracies": 0.796875, + "eval_rewards/chosen": -0.4289108216762543, + "eval_rewards/margins": 1.4678754806518555, + "eval_rewards/rejected": -1.8967863321304321, + "eval_runtime": 55.1088, + "eval_samples_per_second": 18.146, + "eval_steps_per_second": 0.29, + "step": 800 + }, + { + "epoch": 0.42, + "learning_rate": 4.781985083189902e-07, + "logits/chosen": -2.8288321495056152, + "logits/rejected": -2.7795658111572266, + "logps/chosen": -183.28457641601562, + "logps/rejected": -241.27743530273438, + "loss": 0.557, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.7061542868614197, + "rewards/margins": 1.3368699550628662, + "rewards/rejected": -2.0430245399475098, + "step": 810 + }, + { + "epoch": 0.42, + "learning_rate": 4.772423025435074e-07, + "logits/chosen": -3.0099616050720215, + "logits/rejected": -2.973783016204834, + "logps/chosen": -186.28518676757812, + "logps/rejected": -266.48236083984375, + "loss": 0.6266, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.71001797914505, + "rewards/margins": -0.206703782081604, + "rewards/rejected": -0.5033141374588013, + "step": 820 + }, + { + "epoch": 0.43, + "learning_rate": 4.762860967680244e-07, + "logits/chosen": -2.9518191814422607, + "logits/rejected": -3.006854772567749, + "logps/chosen": -195.8343048095703, + "logps/rejected": -227.02340698242188, + "loss": 0.5728, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.15900671482086182, + "rewards/margins": 0.9850654602050781, + "rewards/rejected": -1.1440720558166504, + "step": 830 + }, + { + "epoch": 0.43, + "learning_rate": 4.7532989099254154e-07, + "logits/chosen": -3.009342670440674, + "logits/rejected": -3.0587260723114014, + "logps/chosen": -300.0588684082031, + "logps/rejected": -257.58203125, + "loss": 0.6019, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6498911380767822, + "rewards/margins": 0.7764835953712463, + "rewards/rejected": -1.4263746738433838, + "step": 840 + }, + { + "epoch": 0.44, + "learning_rate": 4.7437368521705866e-07, + "logits/chosen": -2.902837038040161, + "logits/rejected": -2.8557207584381104, + "logps/chosen": -246.87142944335938, + "logps/rejected": -213.7313232421875, + "loss": 0.5167, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.49609607458114624, + "rewards/margins": 1.5304511785507202, + "rewards/rejected": -2.026547431945801, + "step": 850 + }, + { + "epoch": 0.44, + "learning_rate": 4.7341747944157577e-07, + "logits/chosen": -2.726759910583496, + "logits/rejected": -2.728843927383423, + "logps/chosen": -199.48330688476562, + "logps/rejected": -235.99014282226562, + "loss": 0.5803, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.1567280292510986, + "rewards/margins": 1.015815258026123, + "rewards/rejected": -2.1725430488586426, + "step": 860 + }, + { + "epoch": 0.45, + "learning_rate": 4.724612736660929e-07, + "logits/chosen": -2.89784836769104, + "logits/rejected": -2.9295287132263184, + "logps/chosen": -265.757080078125, + "logps/rejected": -193.9804229736328, + "loss": 0.482, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.018481796607375145, + "rewards/margins": 2.08237361907959, + "rewards/rejected": -2.063891887664795, + "step": 870 + }, + { + "epoch": 0.45, + "learning_rate": 4.7150506789061006e-07, + "logits/chosen": -2.8157646656036377, + "logits/rejected": -2.831799268722534, + "logps/chosen": -235.80184936523438, + "logps/rejected": -296.13421630859375, + "loss": 0.5127, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5655059814453125, + "rewards/margins": 1.134603500366211, + "rewards/rejected": -1.7001097202301025, + "step": 880 + }, + { + "epoch": 0.46, + "learning_rate": 4.7054886211512717e-07, + "logits/chosen": -2.9849319458007812, + "logits/rejected": -2.9874143600463867, + "logps/chosen": -297.6209411621094, + "logps/rejected": -262.95428466796875, + "loss": 0.5396, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.07225757837295532, + "rewards/margins": 0.5653451085090637, + "rewards/rejected": -0.6376025676727295, + "step": 890 + }, + { + "epoch": 0.46, + "learning_rate": 4.695926563396443e-07, + "logits/chosen": -3.043614149093628, + "logits/rejected": -3.0626580715179443, + "logps/chosen": -243.42160034179688, + "logps/rejected": -207.7015838623047, + "loss": 0.5251, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3596685528755188, + "rewards/margins": 1.1817331314086914, + "rewards/rejected": -1.5414015054702759, + "step": 900 + }, + { + "epoch": 0.46, + "eval_logits/chosen": -2.8661580085754395, + "eval_logits/rejected": -2.846320629119873, + "eval_logps/chosen": -256.3507080078125, + "eval_logps/rejected": -270.0767822265625, + "eval_loss": 0.5772436261177063, + "eval_rewards/accuracies": 0.734375, + "eval_rewards/chosen": -0.21155984699726105, + "eval_rewards/margins": 1.0991249084472656, + "eval_rewards/rejected": -1.3106846809387207, + "eval_runtime": 58.614, + "eval_samples_per_second": 17.061, + "eval_steps_per_second": 0.273, + "step": 900 + }, + { + "epoch": 0.47, + "learning_rate": 4.686364505641614e-07, + "logits/chosen": -3.027421474456787, + "logits/rejected": -3.1281371116638184, + "logps/chosen": -293.70989990234375, + "logps/rejected": -189.66464233398438, + "loss": 0.4978, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.41769805550575256, + "rewards/margins": 1.199561357498169, + "rewards/rejected": -1.6172593832015991, + "step": 910 + }, + { + "epoch": 0.47, + "learning_rate": 4.676802447886785e-07, + "logits/chosen": -2.8584070205688477, + "logits/rejected": -2.882302761077881, + "logps/chosen": -256.07684326171875, + "logps/rejected": -255.83047485351562, + "loss": 0.5412, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.33531659841537476, + "rewards/margins": 0.8404865264892578, + "rewards/rejected": -1.1758031845092773, + "step": 920 + }, + { + "epoch": 0.48, + "learning_rate": 4.6672403901319564e-07, + "logits/chosen": -2.929386854171753, + "logits/rejected": -3.0053086280822754, + "logps/chosen": -316.0078125, + "logps/rejected": -171.36656188964844, + "loss": 0.4762, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.023432254791259766, + "rewards/margins": 2.0073459148406982, + "rewards/rejected": -1.9839136600494385, + "step": 930 + }, + { + "epoch": 0.49, + "learning_rate": 4.6576783323771275e-07, + "logits/chosen": -2.8826663494110107, + "logits/rejected": -2.8366870880126953, + "logps/chosen": -243.7962188720703, + "logps/rejected": -187.9961700439453, + "loss": 0.503, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.2530770003795624, + "rewards/margins": 1.1314551830291748, + "rewards/rejected": -1.3845322132110596, + "step": 940 + }, + { + "epoch": 0.49, + "learning_rate": 4.6481162746222987e-07, + "logits/chosen": -2.8165650367736816, + "logits/rejected": -2.9121110439300537, + "logps/chosen": -251.54098510742188, + "logps/rejected": -237.3175506591797, + "loss": 0.8784, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.2420404851436615, + "rewards/margins": 2.1679394245147705, + "rewards/rejected": -2.409980058670044, + "step": 950 + }, + { + "epoch": 0.5, + "learning_rate": 4.63855421686747e-07, + "logits/chosen": -2.813908100128174, + "logits/rejected": -2.8820648193359375, + "logps/chosen": -274.791748046875, + "logps/rejected": -240.8386993408203, + "loss": 0.5649, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.04944751411676407, + "rewards/margins": 1.3660838603973389, + "rewards/rejected": -1.4155313968658447, + "step": 960 + }, + { + "epoch": 0.5, + "learning_rate": 4.628992159112641e-07, + "logits/chosen": -2.7781646251678467, + "logits/rejected": -2.8930909633636475, + "logps/chosen": -328.9050598144531, + "logps/rejected": -236.53414916992188, + "loss": 0.5675, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.06258317828178406, + "rewards/margins": 1.6145604848861694, + "rewards/rejected": -1.6771436929702759, + "step": 970 + }, + { + "epoch": 0.51, + "learning_rate": 4.6194301013578116e-07, + "logits/chosen": -2.9083309173583984, + "logits/rejected": -2.824375629425049, + "logps/chosen": -287.6618347167969, + "logps/rejected": -230.19393920898438, + "loss": 0.5169, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.4656705856323242, + "rewards/margins": 1.6011505126953125, + "rewards/rejected": -2.0668210983276367, + "step": 980 + }, + { + "epoch": 0.51, + "learning_rate": 4.609868043602983e-07, + "logits/chosen": -2.937588691711426, + "logits/rejected": -2.896270275115967, + "logps/chosen": -275.5927734375, + "logps/rejected": -288.66680908203125, + "loss": 0.4917, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.19148771464824677, + "rewards/margins": 1.2772417068481445, + "rewards/rejected": -1.4687296152114868, + "step": 990 + }, + { + "epoch": 0.52, + "learning_rate": 4.600305985848154e-07, + "logits/chosen": -2.9878718852996826, + "logits/rejected": -3.0833239555358887, + "logps/chosen": -213.8026123046875, + "logps/rejected": -214.43362426757812, + "loss": 0.5205, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.5573463439941406, + "rewards/margins": 0.5783860087394714, + "rewards/rejected": -1.1357324123382568, + "step": 1000 + }, + { + "epoch": 0.52, + "eval_logits/chosen": -2.7978734970092773, + "eval_logits/rejected": -2.78934907913208, + "eval_logps/chosen": -258.027587890625, + "eval_logps/rejected": -275.55523681640625, + "eval_loss": 0.5262419581413269, + "eval_rewards/accuracies": 0.71875, + "eval_rewards/chosen": -0.37924808263778687, + "eval_rewards/margins": 1.479280710220337, + "eval_rewards/rejected": -1.858528733253479, + "eval_runtime": 57.7979, + "eval_samples_per_second": 17.302, + "eval_steps_per_second": 0.277, + "step": 1000 + }, + { + "epoch": 0.52, + "learning_rate": 4.590743928093325e-07, + "logits/chosen": -2.8478896617889404, + "logits/rejected": -2.786147117614746, + "logps/chosen": -383.54327392578125, + "logps/rejected": -270.4455261230469, + "loss": 0.5498, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0184290409088135, + "rewards/margins": 0.9436414837837219, + "rewards/rejected": -1.9620707035064697, + "step": 1010 + }, + { + "epoch": 0.53, + "learning_rate": 4.581181870338497e-07, + "logits/chosen": -2.8463029861450195, + "logits/rejected": -2.900444746017456, + "logps/chosen": -327.9524841308594, + "logps/rejected": -274.52862548828125, + "loss": 0.5847, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.27039843797683716, + "rewards/margins": 1.4945565462112427, + "rewards/rejected": -1.7649548053741455, + "step": 1020 + }, + { + "epoch": 0.53, + "learning_rate": 4.571619812583668e-07, + "logits/chosen": -2.793391466140747, + "logits/rejected": -2.795802354812622, + "logps/chosen": -267.8859558105469, + "logps/rejected": -217.9220733642578, + "loss": 0.4481, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.3454091250896454, + "rewards/margins": 2.0243167877197266, + "rewards/rejected": -2.3697259426116943, + "step": 1030 + }, + { + "epoch": 0.54, + "learning_rate": 4.562057754828839e-07, + "logits/chosen": -2.868319034576416, + "logits/rejected": -2.905986785888672, + "logps/chosen": -264.89349365234375, + "logps/rejected": -310.06231689453125, + "loss": 0.5553, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.33068472146987915, + "rewards/margins": 2.051600933074951, + "rewards/rejected": -2.3822855949401855, + "step": 1040 + }, + { + "epoch": 0.54, + "learning_rate": 4.55249569707401e-07, + "logits/chosen": -2.834726572036743, + "logits/rejected": -2.9254660606384277, + "logps/chosen": -293.556884765625, + "logps/rejected": -234.21005249023438, + "loss": 0.5042, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1473982334136963, + "rewards/margins": 1.1652801036834717, + "rewards/rejected": -2.312678337097168, + "step": 1050 + }, + { + "epoch": 0.55, + "learning_rate": 4.5429336393191814e-07, + "logits/chosen": -2.7575011253356934, + "logits/rejected": -2.8820366859436035, + "logps/chosen": -322.6793518066406, + "logps/rejected": -208.82388305664062, + "loss": 0.6648, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6877565383911133, + "rewards/margins": 0.889872670173645, + "rewards/rejected": -1.5776290893554688, + "step": 1060 + }, + { + "epoch": 0.55, + "learning_rate": 4.5333715815643525e-07, + "logits/chosen": -2.8447697162628174, + "logits/rejected": -2.8715322017669678, + "logps/chosen": -386.28570556640625, + "logps/rejected": -268.4273376464844, + "loss": 0.5397, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.26172274351119995, + "rewards/margins": 1.4582087993621826, + "rewards/rejected": -1.7199318408966064, + "step": 1070 + }, + { + "epoch": 0.56, + "learning_rate": 4.5238095238095237e-07, + "logits/chosen": -2.957181215286255, + "logits/rejected": -2.9968810081481934, + "logps/chosen": -263.70684814453125, + "logps/rejected": -233.9396209716797, + "loss": 0.5506, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8327061533927917, + "rewards/margins": 0.7109770774841309, + "rewards/rejected": -1.5436832904815674, + "step": 1080 + }, + { + "epoch": 0.56, + "learning_rate": 4.514247466054695e-07, + "logits/chosen": -2.9863791465759277, + "logits/rejected": -2.9521121978759766, + "logps/chosen": -245.79244995117188, + "logps/rejected": -189.44338989257812, + "loss": 0.5105, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.5945242643356323, + "rewards/margins": 1.325272560119629, + "rewards/rejected": -1.9197969436645508, + "step": 1090 + }, + { + "epoch": 0.57, + "learning_rate": 4.504685408299866e-07, + "logits/chosen": -2.892086982727051, + "logits/rejected": -2.942537784576416, + "logps/chosen": -336.52685546875, + "logps/rejected": -304.50567626953125, + "loss": 0.5094, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.14167115092277527, + "rewards/margins": 0.7381815314292908, + "rewards/rejected": -0.8798527717590332, + "step": 1100 + }, + { + "epoch": 0.57, + "eval_logits/chosen": -2.753582715988159, + "eval_logits/rejected": -2.7452518939971924, + "eval_logps/chosen": -260.51361083984375, + "eval_logps/rejected": -276.33770751953125, + "eval_loss": 0.5432895421981812, + "eval_rewards/accuracies": 0.796875, + "eval_rewards/chosen": -0.6278508901596069, + "eval_rewards/margins": 1.3089274168014526, + "eval_rewards/rejected": -1.93677818775177, + "eval_runtime": 53.4701, + "eval_samples_per_second": 18.702, + "eval_steps_per_second": 0.299, + "step": 1100 + }, + { + "epoch": 0.57, + "learning_rate": 4.495123350545037e-07, + "logits/chosen": -2.9081952571868896, + "logits/rejected": -2.9619812965393066, + "logps/chosen": -278.28076171875, + "logps/rejected": -240.11181640625, + "loss": 0.5065, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.323709636926651, + "rewards/margins": 0.9985870122909546, + "rewards/rejected": -1.3222965002059937, + "step": 1110 + }, + { + "epoch": 0.58, + "learning_rate": 4.4855612927902083e-07, + "logits/chosen": -2.8727335929870605, + "logits/rejected": -2.8023390769958496, + "logps/chosen": -325.6292419433594, + "logps/rejected": -239.16049194335938, + "loss": 0.7095, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.41461220383644104, + "rewards/margins": 1.4964089393615723, + "rewards/rejected": -1.9110209941864014, + "step": 1120 + }, + { + "epoch": 0.58, + "learning_rate": 4.4759992350353795e-07, + "logits/chosen": -2.8231780529022217, + "logits/rejected": -2.908735513687134, + "logps/chosen": -304.6755676269531, + "logps/rejected": -277.4778747558594, + "loss": 0.4456, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.8432878255844116, + "rewards/margins": 1.6819578409194946, + "rewards/rejected": -2.5252456665039062, + "step": 1130 + }, + { + "epoch": 0.59, + "learning_rate": 4.46643717728055e-07, + "logits/chosen": -2.7151193618774414, + "logits/rejected": -2.8067574501037598, + "logps/chosen": -280.8679504394531, + "logps/rejected": -273.8851318359375, + "loss": 0.5359, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.0236613750457764, + "rewards/margins": 1.5441632270812988, + "rewards/rejected": -2.567824125289917, + "step": 1140 + }, + { + "epoch": 0.59, + "learning_rate": 4.4568751195257213e-07, + "logits/chosen": -2.776689052581787, + "logits/rejected": -2.8435564041137695, + "logps/chosen": -241.6661376953125, + "logps/rejected": -214.4073944091797, + "loss": 0.5624, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6465439796447754, + "rewards/margins": 1.3678399324417114, + "rewards/rejected": -2.0143837928771973, + "step": 1150 + }, + { + "epoch": 0.6, + "learning_rate": 4.447313061770893e-07, + "logits/chosen": -2.7138195037841797, + "logits/rejected": -2.585179328918457, + "logps/chosen": -335.81146240234375, + "logps/rejected": -350.88385009765625, + "loss": 0.54, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.47935476899147034, + "rewards/margins": 2.091104030609131, + "rewards/rejected": -2.5704588890075684, + "step": 1160 + }, + { + "epoch": 0.6, + "learning_rate": 4.437751004016064e-07, + "logits/chosen": -2.958820104598999, + "logits/rejected": -2.9421515464782715, + "logps/chosen": -305.28839111328125, + "logps/rejected": -262.6142883300781, + "loss": 0.5226, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1983194649219513, + "rewards/margins": 1.0410559177398682, + "rewards/rejected": -1.239375352859497, + "step": 1170 + }, + { + "epoch": 0.61, + "learning_rate": 4.4281889462612353e-07, + "logits/chosen": -2.9157230854034424, + "logits/rejected": -2.9529147148132324, + "logps/chosen": -262.28411865234375, + "logps/rejected": -205.23324584960938, + "loss": 0.4879, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.35369348526000977, + "rewards/margins": 1.1171058416366577, + "rewards/rejected": -1.4707993268966675, + "step": 1180 + }, + { + "epoch": 0.61, + "learning_rate": 4.4186268885064064e-07, + "logits/chosen": -3.012878894805908, + "logits/rejected": -2.9825873374938965, + "logps/chosen": -287.4079895019531, + "logps/rejected": -257.97772216796875, + "loss": 0.5481, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.8515245318412781, + "rewards/margins": 0.8873499035835266, + "rewards/rejected": -1.7388744354248047, + "step": 1190 + }, + { + "epoch": 0.62, + "learning_rate": 4.4090648307515776e-07, + "logits/chosen": -2.9880738258361816, + "logits/rejected": -2.9730162620544434, + "logps/chosen": -224.86508178710938, + "logps/rejected": -165.42733764648438, + "loss": 0.5837, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.528744101524353, + "rewards/margins": 0.9531749486923218, + "rewards/rejected": -1.4819190502166748, + "step": 1200 + }, + { + "epoch": 0.62, + "eval_logits/chosen": -2.775576591491699, + "eval_logits/rejected": -2.7643439769744873, + "eval_logps/chosen": -258.015380859375, + "eval_logps/rejected": -276.55419921875, + "eval_loss": 0.5348690152168274, + "eval_rewards/accuracies": 0.765625, + "eval_rewards/chosen": -0.37802520394325256, + "eval_rewards/margins": 1.5804035663604736, + "eval_rewards/rejected": -1.9584287405014038, + "eval_runtime": 62.7794, + "eval_samples_per_second": 15.929, + "eval_steps_per_second": 0.255, + "step": 1200 + }, + { + "epoch": 0.62, + "learning_rate": 4.399502772996749e-07, + "logits/chosen": -3.065775156021118, + "logits/rejected": -2.972374677658081, + "logps/chosen": -329.2245178222656, + "logps/rejected": -304.1506042480469, + "loss": 0.4827, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5635257363319397, + "rewards/margins": 1.3332871198654175, + "rewards/rejected": -1.8968127965927124, + "step": 1210 + }, + { + "epoch": 0.63, + "learning_rate": 4.38994071524192e-07, + "logits/chosen": -3.030421733856201, + "logits/rejected": -3.05527925491333, + "logps/chosen": -258.029541015625, + "logps/rejected": -280.21673583984375, + "loss": 0.5735, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.8231660723686218, + "rewards/margins": 1.493502140045166, + "rewards/rejected": -2.3166680335998535, + "step": 1220 + }, + { + "epoch": 0.64, + "learning_rate": 4.380378657487091e-07, + "logits/chosen": -3.074777364730835, + "logits/rejected": -3.0200257301330566, + "logps/chosen": -330.8500061035156, + "logps/rejected": -371.5386047363281, + "loss": 0.5982, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.35177409648895264, + "rewards/margins": 1.7098945379257202, + "rewards/rejected": -2.061668634414673, + "step": 1230 + }, + { + "epoch": 0.64, + "learning_rate": 4.370816599732262e-07, + "logits/chosen": -2.999420166015625, + "logits/rejected": -2.9334309101104736, + "logps/chosen": -296.1866760253906, + "logps/rejected": -236.72341918945312, + "loss": 0.5337, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.419023871421814, + "rewards/margins": 1.311092734336853, + "rewards/rejected": -2.730116367340088, + "step": 1240 + }, + { + "epoch": 0.65, + "learning_rate": 4.3612545419774334e-07, + "logits/chosen": -2.883204460144043, + "logits/rejected": -2.8073534965515137, + "logps/chosen": -257.63519287109375, + "logps/rejected": -301.64227294921875, + "loss": 0.5928, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.9009000658988953, + "rewards/margins": 1.269942045211792, + "rewards/rejected": -2.170842409133911, + "step": 1250 + }, + { + "epoch": 0.65, + "learning_rate": 4.3516924842226045e-07, + "logits/chosen": -2.920656681060791, + "logits/rejected": -2.91890025138855, + "logps/chosen": -322.2228698730469, + "logps/rejected": -296.6404113769531, + "loss": 0.629, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9084477424621582, + "rewards/margins": 1.9311565160751343, + "rewards/rejected": -2.839603900909424, + "step": 1260 + }, + { + "epoch": 0.66, + "learning_rate": 4.3421304264677757e-07, + "logits/chosen": -2.7664384841918945, + "logits/rejected": -2.7587597370147705, + "logps/chosen": -214.4353485107422, + "logps/rejected": -212.84805297851562, + "loss": 0.476, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.016667127609253, + "rewards/margins": 1.7192564010620117, + "rewards/rejected": -2.7359237670898438, + "step": 1270 + }, + { + "epoch": 0.66, + "learning_rate": 4.332568368712947e-07, + "logits/chosen": -2.933134078979492, + "logits/rejected": -2.877431869506836, + "logps/chosen": -361.19573974609375, + "logps/rejected": -279.9434814453125, + "loss": 0.5784, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.5590813159942627, + "rewards/margins": 0.4831056594848633, + "rewards/rejected": -2.042186975479126, + "step": 1280 + }, + { + "epoch": 0.67, + "learning_rate": 4.323006310958118e-07, + "logits/chosen": -2.904773235321045, + "logits/rejected": -2.985483169555664, + "logps/chosen": -411.599853515625, + "logps/rejected": -315.9092712402344, + "loss": 0.4975, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.908369243144989, + "rewards/margins": 1.5060293674468994, + "rewards/rejected": -2.414398670196533, + "step": 1290 + }, + { + "epoch": 0.67, + "learning_rate": 4.313444253203289e-07, + "logits/chosen": -2.9397823810577393, + "logits/rejected": -2.9090006351470947, + "logps/chosen": -281.9468078613281, + "logps/rejected": -261.4234619140625, + "loss": 0.5214, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.9744187593460083, + "rewards/margins": 1.7388330698013306, + "rewards/rejected": -2.7132515907287598, + "step": 1300 + }, + { + "epoch": 0.67, + "eval_logits/chosen": -2.711259603500366, + "eval_logits/rejected": -2.698620080947876, + "eval_logps/chosen": -264.2903137207031, + "eval_logps/rejected": -279.27606201171875, + "eval_loss": 0.5732331871986389, + "eval_rewards/accuracies": 0.765625, + "eval_rewards/chosen": -1.0055204629898071, + "eval_rewards/margins": 1.2250933647155762, + "eval_rewards/rejected": -2.2306137084960938, + "eval_runtime": 57.0185, + "eval_samples_per_second": 17.538, + "eval_steps_per_second": 0.281, + "step": 1300 + }, + { + "epoch": 0.68, + "learning_rate": 4.3038821954484603e-07, + "logits/chosen": -2.8764219284057617, + "logits/rejected": -2.7695984840393066, + "logps/chosen": -238.80453491210938, + "logps/rejected": -252.7805633544922, + "loss": 0.4686, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.3183587789535522, + "rewards/margins": 0.5890123248100281, + "rewards/rejected": -1.907371163368225, + "step": 1310 + }, + { + "epoch": 0.68, + "learning_rate": 4.2943201376936315e-07, + "logits/chosen": -2.8463966846466064, + "logits/rejected": -2.850677967071533, + "logps/chosen": -291.9525146484375, + "logps/rejected": -315.1170349121094, + "loss": 0.5393, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.8451521992683411, + "rewards/margins": 1.4508628845214844, + "rewards/rejected": -2.2960150241851807, + "step": 1320 + }, + { + "epoch": 0.69, + "learning_rate": 4.2847580799388026e-07, + "logits/chosen": -2.8076231479644775, + "logits/rejected": -2.7472128868103027, + "logps/chosen": -258.0926513671875, + "logps/rejected": -188.78359985351562, + "loss": 0.5812, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.4854873418807983, + "rewards/margins": 0.9221324920654297, + "rewards/rejected": -2.4076199531555176, + "step": 1330 + }, + { + "epoch": 0.69, + "learning_rate": 4.275196022183974e-07, + "logits/chosen": -2.7642879486083984, + "logits/rejected": -2.812042474746704, + "logps/chosen": -319.1858825683594, + "logps/rejected": -246.2572784423828, + "loss": 0.553, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.092181921005249, + "rewards/margins": 0.8022899627685547, + "rewards/rejected": -1.8944717645645142, + "step": 1340 + }, + { + "epoch": 0.7, + "learning_rate": 4.265633964429145e-07, + "logits/chosen": -2.812278985977173, + "logits/rejected": -2.761359691619873, + "logps/chosen": -342.2608337402344, + "logps/rejected": -224.8918914794922, + "loss": 0.5775, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.914333701133728, + "rewards/margins": 1.487335205078125, + "rewards/rejected": -2.4016687870025635, + "step": 1350 + }, + { + "epoch": 0.7, + "learning_rate": 4.256071906674316e-07, + "logits/chosen": -2.9929111003875732, + "logits/rejected": -2.94170880317688, + "logps/chosen": -379.921875, + "logps/rejected": -368.10357666015625, + "loss": 0.652, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6550450325012207, + "rewards/margins": 0.9407709240913391, + "rewards/rejected": -1.595815896987915, + "step": 1360 + }, + { + "epoch": 0.71, + "learning_rate": 4.246509848919487e-07, + "logits/chosen": -2.912461757659912, + "logits/rejected": -2.9404354095458984, + "logps/chosen": -348.6622009277344, + "logps/rejected": -248.0426788330078, + "loss": 0.5636, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7293527722358704, + "rewards/margins": 1.5709936618804932, + "rewards/rejected": -2.3003463745117188, + "step": 1370 + }, + { + "epoch": 0.71, + "learning_rate": 4.2369477911646584e-07, + "logits/chosen": -2.829761505126953, + "logits/rejected": -2.7305688858032227, + "logps/chosen": -295.06781005859375, + "logps/rejected": -240.6433563232422, + "loss": 0.5567, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.69425368309021, + "rewards/margins": 1.3369704484939575, + "rewards/rejected": -2.031224012374878, + "step": 1380 + }, + { + "epoch": 0.72, + "learning_rate": 4.2273857334098296e-07, + "logits/chosen": -2.7433788776397705, + "logits/rejected": -2.67673921585083, + "logps/chosen": -289.397216796875, + "logps/rejected": -243.04833984375, + "loss": 0.6061, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.7310119867324829, + "rewards/margins": 0.585370659828186, + "rewards/rejected": -1.316382646560669, + "step": 1390 + }, + { + "epoch": 0.72, + "learning_rate": 4.2178236756550007e-07, + "logits/chosen": -2.923424243927002, + "logits/rejected": -2.912429094314575, + "logps/chosen": -198.62017822265625, + "logps/rejected": -277.8341979980469, + "loss": 0.6914, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.4593328535556793, + "rewards/margins": 0.9467372894287109, + "rewards/rejected": -1.4060701131820679, + "step": 1400 + }, + { + "epoch": 0.72, + "eval_logits/chosen": -2.727492094039917, + "eval_logits/rejected": -2.7166121006011963, + "eval_logps/chosen": -261.146728515625, + "eval_logps/rejected": -278.7448425292969, + "eval_loss": 0.5136687159538269, + "eval_rewards/accuracies": 0.796875, + "eval_rewards/chosen": -0.6911616921424866, + "eval_rewards/margins": 1.4863313436508179, + "eval_rewards/rejected": -2.177493095397949, + "eval_runtime": 58.5256, + "eval_samples_per_second": 17.087, + "eval_steps_per_second": 0.273, + "step": 1400 + }, + { + "epoch": 0.73, + "learning_rate": 4.208261617900172e-07, + "logits/chosen": -2.9004664421081543, + "logits/rejected": -2.991079807281494, + "logps/chosen": -200.07357788085938, + "logps/rejected": -212.9990234375, + "loss": 0.4996, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.8260477185249329, + "rewards/margins": 1.3000409603118896, + "rewards/rejected": -2.1260886192321777, + "step": 1410 + }, + { + "epoch": 0.73, + "learning_rate": 4.198699560145343e-07, + "logits/chosen": -2.750919818878174, + "logits/rejected": -2.563699245452881, + "logps/chosen": -229.60348510742188, + "logps/rejected": -250.4010467529297, + "loss": 0.6298, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.8068568110466003, + "rewards/margins": 0.5274587869644165, + "rewards/rejected": -1.334315538406372, + "step": 1420 + }, + { + "epoch": 0.74, + "learning_rate": 4.189137502390514e-07, + "logits/chosen": -2.871040105819702, + "logits/rejected": -2.859773635864258, + "logps/chosen": -255.4622344970703, + "logps/rejected": -367.54644775390625, + "loss": 0.6702, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.7843345999717712, + "rewards/margins": 1.0513083934783936, + "rewards/rejected": -1.8356430530548096, + "step": 1430 + }, + { + "epoch": 0.74, + "learning_rate": 4.179575444635686e-07, + "logits/chosen": -2.797947406768799, + "logits/rejected": -2.768245220184326, + "logps/chosen": -297.70465087890625, + "logps/rejected": -303.483154296875, + "loss": 0.5911, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.02579927444458, + "rewards/margins": 1.1320759057998657, + "rewards/rejected": -2.1578750610351562, + "step": 1440 + }, + { + "epoch": 0.75, + "learning_rate": 4.170013386880857e-07, + "logits/chosen": -2.842421531677246, + "logits/rejected": -2.816070318222046, + "logps/chosen": -286.8984069824219, + "logps/rejected": -256.8731689453125, + "loss": 0.4926, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.8610552549362183, + "rewards/margins": 0.9260069727897644, + "rewards/rejected": -1.7870622873306274, + "step": 1450 + }, + { + "epoch": 0.75, + "learning_rate": 4.1604513291260277e-07, + "logits/chosen": -2.814866542816162, + "logits/rejected": -2.7706210613250732, + "logps/chosen": -273.64111328125, + "logps/rejected": -259.51885986328125, + "loss": 0.5227, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.5814541578292847, + "rewards/margins": 2.4891767501831055, + "rewards/rejected": -3.0706310272216797, + "step": 1460 + }, + { + "epoch": 0.76, + "learning_rate": 4.150889271371199e-07, + "logits/chosen": -2.7491848468780518, + "logits/rejected": -2.800107002258301, + "logps/chosen": -268.9418029785156, + "logps/rejected": -296.1412658691406, + "loss": 0.59, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.48690223693847656, + "rewards/margins": 1.39999520778656, + "rewards/rejected": -1.886897325515747, + "step": 1470 + }, + { + "epoch": 0.76, + "learning_rate": 4.14132721361637e-07, + "logits/chosen": -2.7790863513946533, + "logits/rejected": -2.756493330001831, + "logps/chosen": -376.4327392578125, + "logps/rejected": -220.66128540039062, + "loss": 0.4328, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.6699414849281311, + "rewards/margins": 2.2528209686279297, + "rewards/rejected": -2.922762632369995, + "step": 1480 + }, + { + "epoch": 0.77, + "learning_rate": 4.131765155861541e-07, + "logits/chosen": -2.732978343963623, + "logits/rejected": -2.712939739227295, + "logps/chosen": -216.78231811523438, + "logps/rejected": -187.97975158691406, + "loss": 0.5317, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3067319989204407, + "rewards/margins": 1.883384346961975, + "rewards/rejected": -2.1901164054870605, + "step": 1490 + }, + { + "epoch": 0.77, + "learning_rate": 4.1222030981067123e-07, + "logits/chosen": -2.7365033626556396, + "logits/rejected": -2.709888458251953, + "logps/chosen": -284.1839294433594, + "logps/rejected": -285.10064697265625, + "loss": 0.4655, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.2894710302352905, + "rewards/margins": 0.8915459513664246, + "rewards/rejected": -2.1810169219970703, + "step": 1500 + }, + { + "epoch": 0.77, + "eval_logits/chosen": -2.6837804317474365, + "eval_logits/rejected": -2.6651253700256348, + "eval_logps/chosen": -262.2220153808594, + "eval_logps/rejected": -279.8998718261719, + "eval_loss": 0.5090023875236511, + "eval_rewards/accuracies": 0.703125, + "eval_rewards/chosen": -0.7986923456192017, + "eval_rewards/margins": 1.494301438331604, + "eval_rewards/rejected": -2.2929937839508057, + "eval_runtime": 59.1398, + "eval_samples_per_second": 16.909, + "eval_steps_per_second": 0.271, + "step": 1500 + }, + { + "epoch": 0.78, + "learning_rate": 4.1126410403518835e-07, + "logits/chosen": -2.774035930633545, + "logits/rejected": -2.5919785499572754, + "logps/chosen": -263.84185791015625, + "logps/rejected": -284.3238220214844, + "loss": 0.506, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.6006399393081665, + "rewards/margins": 2.4321160316467285, + "rewards/rejected": -3.0327563285827637, + "step": 1510 + }, + { + "epoch": 0.78, + "learning_rate": 4.1030789825970546e-07, + "logits/chosen": -2.834711790084839, + "logits/rejected": -2.8974971771240234, + "logps/chosen": -314.38604736328125, + "logps/rejected": -352.1858825683594, + "loss": 0.5863, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.9283245205879211, + "rewards/margins": 0.4295298457145691, + "rewards/rejected": -1.3578544855117798, + "step": 1520 + }, + { + "epoch": 0.79, + "learning_rate": 4.093516924842226e-07, + "logits/chosen": -2.6712985038757324, + "logits/rejected": -2.6710007190704346, + "logps/chosen": -241.9701690673828, + "logps/rejected": -220.11502075195312, + "loss": 0.4433, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.21889445185661316, + "rewards/margins": 1.4537973403930664, + "rewards/rejected": -1.6726917028427124, + "step": 1530 + }, + { + "epoch": 0.8, + "learning_rate": 4.083954867087397e-07, + "logits/chosen": -2.9339497089385986, + "logits/rejected": -2.9125781059265137, + "logps/chosen": -197.97679138183594, + "logps/rejected": -202.1653289794922, + "loss": 0.5284, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.0227124691009521, + "rewards/margins": 0.7052000761032104, + "rewards/rejected": -1.7279125452041626, + "step": 1540 + }, + { + "epoch": 0.8, + "learning_rate": 4.074392809332568e-07, + "logits/chosen": -2.841710329055786, + "logits/rejected": -2.8297677040100098, + "logps/chosen": -316.0207824707031, + "logps/rejected": -229.83837890625, + "loss": 0.513, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.8295713663101196, + "rewards/margins": 1.6421802043914795, + "rewards/rejected": -2.4717514514923096, + "step": 1550 + }, + { + "epoch": 0.81, + "learning_rate": 4.064830751577739e-07, + "logits/chosen": -2.889648914337158, + "logits/rejected": -2.767516613006592, + "logps/chosen": -185.7052001953125, + "logps/rejected": -177.06546020507812, + "loss": 0.4782, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.2799871563911438, + "rewards/margins": 1.6918373107910156, + "rewards/rejected": -1.971824288368225, + "step": 1560 + }, + { + "epoch": 0.81, + "learning_rate": 4.0552686938229104e-07, + "logits/chosen": -2.913878917694092, + "logits/rejected": -2.7892653942108154, + "logps/chosen": -242.6092071533203, + "logps/rejected": -218.4198760986328, + "loss": 0.4485, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.5770751237869263, + "rewards/margins": 1.0650291442871094, + "rewards/rejected": -1.6421045064926147, + "step": 1570 + }, + { + "epoch": 0.82, + "learning_rate": 4.045706636068082e-07, + "logits/chosen": -2.769942045211792, + "logits/rejected": -2.8180408477783203, + "logps/chosen": -178.39805603027344, + "logps/rejected": -233.78713989257812, + "loss": 0.4683, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.6862732768058777, + "rewards/margins": 1.50155770778656, + "rewards/rejected": -2.187831163406372, + "step": 1580 + }, + { + "epoch": 0.82, + "learning_rate": 4.036144578313253e-07, + "logits/chosen": -2.832733392715454, + "logits/rejected": -2.9456982612609863, + "logps/chosen": -258.8636779785156, + "logps/rejected": -223.64968872070312, + "loss": 0.5346, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5500979423522949, + "rewards/margins": 1.3681669235229492, + "rewards/rejected": -1.9182647466659546, + "step": 1590 + }, + { + "epoch": 0.83, + "learning_rate": 4.0265825205584244e-07, + "logits/chosen": -2.820535182952881, + "logits/rejected": -2.8729701042175293, + "logps/chosen": -261.64056396484375, + "logps/rejected": -265.8938293457031, + "loss": 0.5731, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.2627449929714203, + "rewards/margins": 1.6428673267364502, + "rewards/rejected": -1.9056123495101929, + "step": 1600 + }, + { + "epoch": 0.83, + "eval_logits/chosen": -2.6727685928344727, + "eval_logits/rejected": -2.654268264770508, + "eval_logps/chosen": -262.48760986328125, + "eval_logps/rejected": -280.4902038574219, + "eval_loss": 0.5312163829803467, + "eval_rewards/accuracies": 0.78125, + "eval_rewards/chosen": -0.8252508044242859, + "eval_rewards/margins": 1.5267785787582397, + "eval_rewards/rejected": -2.352029323577881, + "eval_runtime": 58.1435, + "eval_samples_per_second": 17.199, + "eval_steps_per_second": 0.275, + "step": 1600 + }, + { + "epoch": 0.83, + "learning_rate": 4.0170204628035956e-07, + "logits/chosen": -2.7514543533325195, + "logits/rejected": -2.8077378273010254, + "logps/chosen": -211.7623748779297, + "logps/rejected": -252.843994140625, + "loss": 0.494, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.7454391121864319, + "rewards/margins": 1.7308275699615479, + "rewards/rejected": -2.476266384124756, + "step": 1610 + }, + { + "epoch": 0.84, + "learning_rate": 4.007458405048766e-07, + "logits/chosen": -2.857224225997925, + "logits/rejected": -2.839128017425537, + "logps/chosen": -282.80975341796875, + "logps/rejected": -277.6203308105469, + "loss": 0.4801, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9825541377067566, + "rewards/margins": 1.5490392446517944, + "rewards/rejected": -2.5315933227539062, + "step": 1620 + }, + { + "epoch": 0.84, + "learning_rate": 3.9978963472939373e-07, + "logits/chosen": -2.838963031768799, + "logits/rejected": -2.808168411254883, + "logps/chosen": -291.43280029296875, + "logps/rejected": -260.65203857421875, + "loss": 0.48, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.7560056447982788, + "rewards/margins": 1.1872189044952393, + "rewards/rejected": -1.9432246685028076, + "step": 1630 + }, + { + "epoch": 0.85, + "learning_rate": 3.9883342895391085e-07, + "logits/chosen": -2.815406084060669, + "logits/rejected": -2.7723686695098877, + "logps/chosen": -270.00689697265625, + "logps/rejected": -229.3192901611328, + "loss": 0.546, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.42303770780563354, + "rewards/margins": 2.0215001106262207, + "rewards/rejected": -2.4445383548736572, + "step": 1640 + }, + { + "epoch": 0.85, + "learning_rate": 3.9787722317842796e-07, + "logits/chosen": -2.89304256439209, + "logits/rejected": -2.849522113800049, + "logps/chosen": -351.61968994140625, + "logps/rejected": -226.35800170898438, + "loss": 0.5355, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.039799273014068604, + "rewards/margins": 1.9888496398925781, + "rewards/rejected": -1.9490505456924438, + "step": 1650 + }, + { + "epoch": 0.86, + "learning_rate": 3.969210174029451e-07, + "logits/chosen": -2.7271950244903564, + "logits/rejected": -2.7782604694366455, + "logps/chosen": -182.22679138183594, + "logps/rejected": -258.85784912109375, + "loss": 0.5878, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.30569443106651306, + "rewards/margins": 2.009462833404541, + "rewards/rejected": -2.315157175064087, + "step": 1660 + }, + { + "epoch": 0.86, + "learning_rate": 3.959648116274622e-07, + "logits/chosen": -2.7675626277923584, + "logits/rejected": -2.7918038368225098, + "logps/chosen": -258.5990295410156, + "logps/rejected": -230.567138671875, + "loss": 0.6044, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.6162198781967163, + "rewards/margins": 1.2834047079086304, + "rewards/rejected": -1.8996245861053467, + "step": 1670 + }, + { + "epoch": 0.87, + "learning_rate": 3.950086058519793e-07, + "logits/chosen": -2.922581195831299, + "logits/rejected": -2.8593482971191406, + "logps/chosen": -224.3607940673828, + "logps/rejected": -256.91510009765625, + "loss": 0.5717, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.4705128073692322, + "rewards/margins": 1.2131963968276978, + "rewards/rejected": -1.6837093830108643, + "step": 1680 + }, + { + "epoch": 0.87, + "learning_rate": 3.9405240007649643e-07, + "logits/chosen": -2.9224143028259277, + "logits/rejected": -2.942783832550049, + "logps/chosen": -216.39816284179688, + "logps/rejected": -272.4553527832031, + "loss": 0.5066, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.5551620125770569, + "rewards/margins": 1.4193694591522217, + "rewards/rejected": -1.9745315313339233, + "step": 1690 + }, + { + "epoch": 0.88, + "learning_rate": 3.9309619430101354e-07, + "logits/chosen": -2.8787567615509033, + "logits/rejected": -2.8499011993408203, + "logps/chosen": -241.4861297607422, + "logps/rejected": -210.9620361328125, + "loss": 0.5233, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7351746559143066, + "rewards/margins": 1.017896294593811, + "rewards/rejected": -1.7530708312988281, + "step": 1700 + }, + { + "epoch": 0.88, + "eval_logits/chosen": -2.709693431854248, + "eval_logits/rejected": -2.686978340148926, + "eval_logps/chosen": -258.8084411621094, + "eval_logps/rejected": -277.92047119140625, + "eval_loss": 0.5205972790718079, + "eval_rewards/accuracies": 0.78125, + "eval_rewards/chosen": -0.4573337435722351, + "eval_rewards/margins": 1.637721300125122, + "eval_rewards/rejected": -2.095055103302002, + "eval_runtime": 55.0835, + "eval_samples_per_second": 18.154, + "eval_steps_per_second": 0.29, + "step": 1700 + }, + { + "epoch": 0.88, + "learning_rate": 3.9213998852553066e-07, + "logits/chosen": -2.900834798812866, + "logits/rejected": -2.703029155731201, + "logps/chosen": -338.3115539550781, + "logps/rejected": -326.6482238769531, + "loss": 0.6064, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.00930917263031, + "rewards/margins": 1.1334011554718018, + "rewards/rejected": -2.1427102088928223, + "step": 1710 + }, + { + "epoch": 0.89, + "learning_rate": 3.9118378275004783e-07, + "logits/chosen": -2.888641119003296, + "logits/rejected": -2.817422389984131, + "logps/chosen": -339.2339172363281, + "logps/rejected": -273.8936462402344, + "loss": 0.5537, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.8813110589981079, + "rewards/margins": 1.3287475109100342, + "rewards/rejected": -2.2100586891174316, + "step": 1720 + }, + { + "epoch": 0.89, + "learning_rate": 3.9022757697456494e-07, + "logits/chosen": -2.739957094192505, + "logits/rejected": -2.78080677986145, + "logps/chosen": -356.21844482421875, + "logps/rejected": -331.55096435546875, + "loss": 0.5099, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5665701627731323, + "rewards/margins": 1.0891746282577515, + "rewards/rejected": -1.6557449102401733, + "step": 1730 + }, + { + "epoch": 0.9, + "learning_rate": 3.8927137119908206e-07, + "logits/chosen": -2.9097769260406494, + "logits/rejected": -2.848907947540283, + "logps/chosen": -305.66607666015625, + "logps/rejected": -211.9135284423828, + "loss": 0.487, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.30798807740211487, + "rewards/margins": 2.1856961250305176, + "rewards/rejected": -2.4936842918395996, + "step": 1740 + }, + { + "epoch": 0.9, + "learning_rate": 3.883151654235992e-07, + "logits/chosen": -2.831984281539917, + "logits/rejected": -2.8497231006622314, + "logps/chosen": -256.02667236328125, + "logps/rejected": -253.0034942626953, + "loss": 0.5203, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.10425261408090591, + "rewards/margins": 1.9909296035766602, + "rewards/rejected": -2.095182180404663, + "step": 1750 + }, + { + "epoch": 0.91, + "learning_rate": 3.873589596481163e-07, + "logits/chosen": -2.765052080154419, + "logits/rejected": -2.8637542724609375, + "logps/chosen": -267.7926025390625, + "logps/rejected": -239.08718872070312, + "loss": 0.5506, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.7126263976097107, + "rewards/margins": 0.9376744031906128, + "rewards/rejected": -1.6503007411956787, + "step": 1760 + }, + { + "epoch": 0.91, + "learning_rate": 3.864027538726334e-07, + "logits/chosen": -2.8235788345336914, + "logits/rejected": -2.7703769207000732, + "logps/chosen": -227.66281127929688, + "logps/rejected": -227.4187469482422, + "loss": 0.5707, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.47516727447509766, + "rewards/margins": 1.8380186557769775, + "rewards/rejected": -2.313185691833496, + "step": 1770 + }, + { + "epoch": 0.92, + "learning_rate": 3.8544654809715047e-07, + "logits/chosen": -2.840291976928711, + "logits/rejected": -2.889273166656494, + "logps/chosen": -289.7173156738281, + "logps/rejected": -289.07391357421875, + "loss": 0.5463, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.0754880905151367, + "rewards/margins": 1.31435227394104, + "rewards/rejected": -2.3898403644561768, + "step": 1780 + }, + { + "epoch": 0.92, + "learning_rate": 3.844903423216676e-07, + "logits/chosen": -2.862814426422119, + "logits/rejected": -2.8419394493103027, + "logps/chosen": -256.5997619628906, + "logps/rejected": -221.2139434814453, + "loss": 0.5288, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.6081517338752747, + "rewards/margins": 0.817557156085968, + "rewards/rejected": -1.4257088899612427, + "step": 1790 + }, + { + "epoch": 0.93, + "learning_rate": 3.835341365461847e-07, + "logits/chosen": -2.836991786956787, + "logits/rejected": -2.78855562210083, + "logps/chosen": -302.5685119628906, + "logps/rejected": -237.11209106445312, + "loss": 0.5593, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.189387708902359, + "rewards/margins": 1.4565317630767822, + "rewards/rejected": -1.6459195613861084, + "step": 1800 + }, + { + "epoch": 0.93, + "eval_logits/chosen": -2.651865005493164, + "eval_logits/rejected": -2.6221344470977783, + "eval_logps/chosen": -259.7433166503906, + "eval_logps/rejected": -278.97027587890625, + "eval_loss": 0.523062527179718, + "eval_rewards/accuracies": 0.796875, + "eval_rewards/chosen": -0.5508205890655518, + "eval_rewards/margins": 1.649214267730713, + "eval_rewards/rejected": -2.2000348567962646, + "eval_runtime": 52.7667, + "eval_samples_per_second": 18.951, + "eval_steps_per_second": 0.303, + "step": 1800 + }, + { + "epoch": 0.93, + "learning_rate": 3.825779307707018e-07, + "logits/chosen": -2.805868625640869, + "logits/rejected": -2.8683719635009766, + "logps/chosen": -165.62680053710938, + "logps/rejected": -188.5421600341797, + "loss": 0.5336, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.4086759090423584, + "rewards/margins": 1.5789250135421753, + "rewards/rejected": -1.9876010417938232, + "step": 1810 + }, + { + "epoch": 0.94, + "learning_rate": 3.8162172499521893e-07, + "logits/chosen": -2.823812961578369, + "logits/rejected": -2.8563215732574463, + "logps/chosen": -220.8478546142578, + "logps/rejected": -212.088623046875, + "loss": 0.5532, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.7714720368385315, + "rewards/margins": 1.1308372020721436, + "rewards/rejected": -1.9023091793060303, + "step": 1820 + }, + { + "epoch": 0.94, + "learning_rate": 3.8066551921973605e-07, + "logits/chosen": -2.772951126098633, + "logits/rejected": -2.741703510284424, + "logps/chosen": -282.5356750488281, + "logps/rejected": -204.34359741210938, + "loss": 0.4889, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4399307668209076, + "rewards/margins": 1.7154382467269897, + "rewards/rejected": -2.1553690433502197, + "step": 1830 + }, + { + "epoch": 0.95, + "learning_rate": 3.7970931344425316e-07, + "logits/chosen": -2.822625160217285, + "logits/rejected": -2.813814640045166, + "logps/chosen": -298.0512390136719, + "logps/rejected": -247.30172729492188, + "loss": 0.4864, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5513595938682556, + "rewards/margins": 0.6814876198768616, + "rewards/rejected": -1.2328474521636963, + "step": 1840 + }, + { + "epoch": 0.96, + "learning_rate": 3.787531076687703e-07, + "logits/chosen": -2.7317397594451904, + "logits/rejected": -2.7448277473449707, + "logps/chosen": -269.85760498046875, + "logps/rejected": -223.9963836669922, + "loss": 0.5273, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.41241344809532166, + "rewards/margins": 2.08837890625, + "rewards/rejected": -2.5007922649383545, + "step": 1850 + }, + { + "epoch": 0.96, + "learning_rate": 3.7779690189328745e-07, + "logits/chosen": -2.812678813934326, + "logits/rejected": -2.8333404064178467, + "logps/chosen": -280.35247802734375, + "logps/rejected": -255.755126953125, + "loss": 0.4633, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.533105194568634, + "rewards/margins": 1.1613143682479858, + "rewards/rejected": -1.694419503211975, + "step": 1860 + }, + { + "epoch": 0.97, + "learning_rate": 3.7684069611780456e-07, + "logits/chosen": -2.824018716812134, + "logits/rejected": -2.789066791534424, + "logps/chosen": -230.8186798095703, + "logps/rejected": -197.3396453857422, + "loss": 0.565, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.8623602986335754, + "rewards/margins": 0.8971187472343445, + "rewards/rejected": -1.7594791650772095, + "step": 1870 + }, + { + "epoch": 0.97, + "learning_rate": 3.758844903423217e-07, + "logits/chosen": -2.731091260910034, + "logits/rejected": -2.8013782501220703, + "logps/chosen": -261.1441650390625, + "logps/rejected": -285.99298095703125, + "loss": 0.5784, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.0055458545684814, + "rewards/margins": 1.3556736707687378, + "rewards/rejected": -2.3612194061279297, + "step": 1880 + }, + { + "epoch": 0.98, + "learning_rate": 3.749282845668388e-07, + "logits/chosen": -2.979485034942627, + "logits/rejected": -2.8463826179504395, + "logps/chosen": -279.7747497558594, + "logps/rejected": -281.71881103515625, + "loss": 0.5474, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.5681439638137817, + "rewards/margins": 1.333145022392273, + "rewards/rejected": -1.9012889862060547, + "step": 1890 + }, + { + "epoch": 0.98, + "learning_rate": 3.739720787913559e-07, + "logits/chosen": -2.8072428703308105, + "logits/rejected": -2.8444907665252686, + "logps/chosen": -318.6991882324219, + "logps/rejected": -286.9081726074219, + "loss": 0.4967, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.501719057559967, + "rewards/margins": 1.7894643545150757, + "rewards/rejected": -2.2911829948425293, + "step": 1900 + }, + { + "epoch": 0.98, + "eval_logits/chosen": -2.6878409385681152, + "eval_logits/rejected": -2.65635347366333, + "eval_logps/chosen": -259.57489013671875, + "eval_logps/rejected": -276.53948974609375, + "eval_loss": 0.528998613357544, + "eval_rewards/accuracies": 0.828125, + "eval_rewards/chosen": -0.5339791178703308, + "eval_rewards/margins": 1.4229780435562134, + "eval_rewards/rejected": -1.9569573402404785, + "eval_runtime": 58.1447, + "eval_samples_per_second": 17.198, + "eval_steps_per_second": 0.275, + "step": 1900 + }, + { + "epoch": 0.99, + "learning_rate": 3.73015873015873e-07, + "logits/chosen": -2.8797740936279297, + "logits/rejected": -2.8204915523529053, + "logps/chosen": -275.71417236328125, + "logps/rejected": -218.75216674804688, + "loss": 0.497, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.2984248995780945, + "rewards/margins": 0.8872405886650085, + "rewards/rejected": -1.185665488243103, + "step": 1910 + }, + { + "epoch": 0.99, + "learning_rate": 3.7205966724039014e-07, + "logits/chosen": -2.728538990020752, + "logits/rejected": -2.7202653884887695, + "logps/chosen": -326.10626220703125, + "logps/rejected": -258.46539306640625, + "loss": 0.4882, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4980931878089905, + "rewards/margins": 1.183232069015503, + "rewards/rejected": -1.6813253164291382, + "step": 1920 + }, + { + "epoch": 1.0, + "learning_rate": 3.711034614649072e-07, + "logits/chosen": -2.7934298515319824, + "logits/rejected": -2.7393717765808105, + "logps/chosen": -288.08892822265625, + "logps/rejected": -230.5961151123047, + "loss": 0.4772, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.8787330389022827, + "rewards/margins": 1.967394232749939, + "rewards/rejected": -2.8461270332336426, + "step": 1930 + }, + { + "epoch": 1.0, + "learning_rate": 3.701472556894243e-07, + "logits/chosen": -2.742077350616455, + "logits/rejected": -2.6756367683410645, + "logps/chosen": -230.4021759033203, + "logps/rejected": -311.9582214355469, + "loss": 0.3813, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0876777321100235, + "rewards/margins": 2.651371479034424, + "rewards/rejected": -2.7390494346618652, + "step": 1940 + }, + { + "epoch": 1.01, + "learning_rate": 3.6919104991394144e-07, + "logits/chosen": -2.8869168758392334, + "logits/rejected": -2.870358943939209, + "logps/chosen": -180.28982543945312, + "logps/rejected": -222.55810546875, + "loss": 0.0952, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.38509073853492737, + "rewards/margins": 3.711843490600586, + "rewards/rejected": -3.3267529010772705, + "step": 1950 + }, + { + "epoch": 1.01, + "learning_rate": 3.6823484413845855e-07, + "logits/chosen": -2.7434253692626953, + "logits/rejected": -2.826244354248047, + "logps/chosen": -290.17999267578125, + "logps/rejected": -359.95318603515625, + "loss": 0.0795, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3888338804244995, + "rewards/margins": 6.838004112243652, + "rewards/rejected": -5.449170112609863, + "step": 1960 + }, + { + "epoch": 1.02, + "learning_rate": 3.6727863836297567e-07, + "logits/chosen": -2.761378765106201, + "logits/rejected": -2.8128793239593506, + "logps/chosen": -196.734619140625, + "logps/rejected": -248.90444946289062, + "loss": 0.0896, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.7179908752441406, + "rewards/margins": 4.64093542098999, + "rewards/rejected": -3.9229445457458496, + "step": 1970 + }, + { + "epoch": 1.02, + "learning_rate": 3.663224325874928e-07, + "logits/chosen": -2.7781982421875, + "logits/rejected": -2.745850086212158, + "logps/chosen": -230.58413696289062, + "logps/rejected": -243.18405151367188, + "loss": 0.1075, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.2712101638317108, + "rewards/margins": 3.9006011486053467, + "rewards/rejected": -4.171811103820801, + "step": 1980 + }, + { + "epoch": 1.03, + "learning_rate": 3.653662268120099e-07, + "logits/chosen": -2.69258975982666, + "logits/rejected": -2.718759775161743, + "logps/chosen": -263.413818359375, + "logps/rejected": -258.7752990722656, + "loss": 0.0765, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1363146305084229, + "rewards/margins": 5.461094856262207, + "rewards/rejected": -4.324779987335205, + "step": 1990 + }, + { + "epoch": 1.03, + "learning_rate": 3.6441002103652707e-07, + "logits/chosen": -2.7689287662506104, + "logits/rejected": -2.7180933952331543, + "logps/chosen": -184.6995391845703, + "logps/rejected": -219.4838409423828, + "loss": 0.0921, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8591575622558594, + "rewards/margins": 3.6370902061462402, + "rewards/rejected": -4.4962477684021, + "step": 2000 + }, + { + "epoch": 1.03, + "eval_logits/chosen": -2.634498119354248, + "eval_logits/rejected": -2.6040313243865967, + "eval_logps/chosen": -265.61114501953125, + "eval_logps/rejected": -288.5853576660156, + "eval_loss": 0.5368282198905945, + "eval_rewards/accuracies": 0.78125, + "eval_rewards/chosen": -1.1376045942306519, + "eval_rewards/margins": 2.023937463760376, + "eval_rewards/rejected": -3.1615419387817383, + "eval_runtime": 57.4706, + "eval_samples_per_second": 17.4, + "eval_steps_per_second": 0.278, + "step": 2000 + }, + { + "epoch": 1.04, + "learning_rate": 3.634538152610442e-07, + "logits/chosen": -2.779973268508911, + "logits/rejected": -2.83324933052063, + "logps/chosen": -228.81320190429688, + "logps/rejected": -310.46905517578125, + "loss": 0.0672, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.3947059214115143, + "rewards/margins": 5.533167362213135, + "rewards/rejected": -5.138461112976074, + "step": 2010 + }, + { + "epoch": 1.04, + "learning_rate": 3.624976094855613e-07, + "logits/chosen": -2.7965312004089355, + "logits/rejected": -2.7170250415802, + "logps/chosen": -250.6485137939453, + "logps/rejected": -262.75152587890625, + "loss": 0.0705, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7664440870285034, + "rewards/margins": 7.152462959289551, + "rewards/rejected": -6.386018753051758, + "step": 2020 + }, + { + "epoch": 1.05, + "learning_rate": 3.615414037100784e-07, + "logits/chosen": -2.6802003383636475, + "logits/rejected": -2.638892650604248, + "logps/chosen": -275.2335205078125, + "logps/rejected": -222.55178833007812, + "loss": 0.099, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.053186558187007904, + "rewards/margins": 4.206416606903076, + "rewards/rejected": -4.259603023529053, + "step": 2030 + }, + { + "epoch": 1.05, + "learning_rate": 3.6058519793459553e-07, + "logits/chosen": -2.795947790145874, + "logits/rejected": -2.8200221061706543, + "logps/chosen": -189.23316955566406, + "logps/rejected": -218.7393341064453, + "loss": 0.0855, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7773109674453735, + "rewards/margins": 5.873230457305908, + "rewards/rejected": -5.095919132232666, + "step": 2040 + }, + { + "epoch": 1.06, + "learning_rate": 3.5962899215911265e-07, + "logits/chosen": -2.630261182785034, + "logits/rejected": -2.57206392288208, + "logps/chosen": -217.30972290039062, + "logps/rejected": -250.2468719482422, + "loss": 0.0676, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.0636889785528183, + "rewards/margins": 4.82668399810791, + "rewards/rejected": -4.8903727531433105, + "step": 2050 + }, + { + "epoch": 1.06, + "learning_rate": 3.5867278638362976e-07, + "logits/chosen": -2.601591110229492, + "logits/rejected": -2.6793808937072754, + "logps/chosen": -240.95700073242188, + "logps/rejected": -273.2774353027344, + "loss": 0.0734, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05607228726148605, + "rewards/margins": 5.447501182556152, + "rewards/rejected": -5.391427516937256, + "step": 2060 + }, + { + "epoch": 1.07, + "learning_rate": 3.577165806081469e-07, + "logits/chosen": -2.7586188316345215, + "logits/rejected": -2.804933547973633, + "logps/chosen": -262.1355895996094, + "logps/rejected": -302.13507080078125, + "loss": 0.095, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5269735455513, + "rewards/margins": 6.823977470397949, + "rewards/rejected": -6.297003746032715, + "step": 2070 + }, + { + "epoch": 1.07, + "learning_rate": 3.56760374832664e-07, + "logits/chosen": -2.906247615814209, + "logits/rejected": -2.7534871101379395, + "logps/chosen": -250.2699737548828, + "logps/rejected": -261.11962890625, + "loss": 0.0438, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4384024143218994, + "rewards/margins": 6.545752048492432, + "rewards/rejected": -5.107348442077637, + "step": 2080 + }, + { + "epoch": 1.08, + "learning_rate": 3.5580416905718106e-07, + "logits/chosen": -2.655647039413452, + "logits/rejected": -2.770217180252075, + "logps/chosen": -294.7089538574219, + "logps/rejected": -271.32037353515625, + "loss": 0.1017, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.16056032478809357, + "rewards/margins": 4.826616287231445, + "rewards/rejected": -4.666056156158447, + "step": 2090 + }, + { + "epoch": 1.08, + "learning_rate": 3.5484796328169817e-07, + "logits/chosen": -2.8367366790771484, + "logits/rejected": -2.7900490760803223, + "logps/chosen": -235.642333984375, + "logps/rejected": -272.7825012207031, + "loss": 0.0733, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0952982902526855, + "rewards/margins": 5.964905261993408, + "rewards/rejected": -4.8696064949035645, + "step": 2100 + }, + { + "epoch": 1.08, + "eval_logits/chosen": -2.659477949142456, + "eval_logits/rejected": -2.628939151763916, + "eval_logps/chosen": -265.2799377441406, + "eval_logps/rejected": -291.42083740234375, + "eval_loss": 0.5452979803085327, + "eval_rewards/accuracies": 0.765625, + "eval_rewards/chosen": -1.1044831275939941, + "eval_rewards/margins": 2.3406097888946533, + "eval_rewards/rejected": -3.4450929164886475, + "eval_runtime": 55.5581, + "eval_samples_per_second": 17.999, + "eval_steps_per_second": 0.288, + "step": 2100 + }, + { + "epoch": 1.09, + "learning_rate": 3.538917575062153e-07, + "logits/chosen": -2.700606346130371, + "logits/rejected": -2.6558995246887207, + "logps/chosen": -210.18374633789062, + "logps/rejected": -282.9020080566406, + "loss": 0.1171, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15502862632274628, + "rewards/margins": 5.092817783355713, + "rewards/rejected": -4.937788963317871, + "step": 2110 + }, + { + "epoch": 1.09, + "learning_rate": 3.529355517307324e-07, + "logits/chosen": -2.8686940670013428, + "logits/rejected": -2.740063190460205, + "logps/chosen": -269.2464904785156, + "logps/rejected": -366.86639404296875, + "loss": 0.0607, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0339075326919556, + "rewards/margins": 8.14382266998291, + "rewards/rejected": -7.109914302825928, + "step": 2120 + }, + { + "epoch": 1.1, + "learning_rate": 3.519793459552495e-07, + "logits/chosen": -2.823467969894409, + "logits/rejected": -2.833052635192871, + "logps/chosen": -280.67706298828125, + "logps/rejected": -302.888671875, + "loss": 0.0767, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.975002646446228, + "rewards/margins": 6.0545806884765625, + "rewards/rejected": -5.079577445983887, + "step": 2130 + }, + { + "epoch": 1.1, + "learning_rate": 3.510231401797667e-07, + "logits/chosen": -2.9246203899383545, + "logits/rejected": -2.8968777656555176, + "logps/chosen": -409.8377380371094, + "logps/rejected": -376.01153564453125, + "loss": 0.0978, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6437445878982544, + "rewards/margins": 7.857232570648193, + "rewards/rejected": -7.2134881019592285, + "step": 2140 + }, + { + "epoch": 1.11, + "learning_rate": 3.500669344042838e-07, + "logits/chosen": -2.8815579414367676, + "logits/rejected": -2.696906566619873, + "logps/chosen": -312.1972961425781, + "logps/rejected": -323.14031982421875, + "loss": 0.0847, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4120159149169922, + "rewards/margins": 6.041172027587891, + "rewards/rejected": -5.629156112670898, + "step": 2150 + }, + { + "epoch": 1.12, + "learning_rate": 3.491107286288009e-07, + "logits/chosen": -2.869086265563965, + "logits/rejected": -2.794461250305176, + "logps/chosen": -230.935302734375, + "logps/rejected": -229.4247283935547, + "loss": 0.1155, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5138686895370483, + "rewards/margins": 6.901867866516113, + "rewards/rejected": -5.387998580932617, + "step": 2160 + }, + { + "epoch": 1.12, + "learning_rate": 3.4815452285331803e-07, + "logits/chosen": -2.906574249267578, + "logits/rejected": -2.918184280395508, + "logps/chosen": -435.5089416503906, + "logps/rejected": -367.09820556640625, + "loss": 0.0949, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.4653266966342926, + "rewards/margins": 4.6506667137146, + "rewards/rejected": -4.18533992767334, + "step": 2170 + }, + { + "epoch": 1.13, + "learning_rate": 3.4719831707783515e-07, + "logits/chosen": -2.6144165992736816, + "logits/rejected": -2.6169540882110596, + "logps/chosen": -259.0269470214844, + "logps/rejected": -253.59518432617188, + "loss": 0.1058, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8547903895378113, + "rewards/margins": 5.843932151794434, + "rewards/rejected": -4.989141941070557, + "step": 2180 + }, + { + "epoch": 1.13, + "learning_rate": 3.4624211130235227e-07, + "logits/chosen": -2.905179738998413, + "logits/rejected": -2.836651086807251, + "logps/chosen": -208.110107421875, + "logps/rejected": -241.3552703857422, + "loss": 0.086, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6082350015640259, + "rewards/margins": 6.723033905029297, + "rewards/rejected": -6.114798545837402, + "step": 2190 + }, + { + "epoch": 1.14, + "learning_rate": 3.452859055268694e-07, + "logits/chosen": -2.7746129035949707, + "logits/rejected": -2.7575089931488037, + "logps/chosen": -204.41021728515625, + "logps/rejected": -263.98004150390625, + "loss": 0.0972, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.20659780502319336, + "rewards/margins": 5.700135707855225, + "rewards/rejected": -5.90673303604126, + "step": 2200 + }, + { + "epoch": 1.14, + "eval_logits/chosen": -2.670933246612549, + "eval_logits/rejected": -2.647088050842285, + "eval_logps/chosen": -271.1505126953125, + "eval_logps/rejected": -296.7934265136719, + "eval_loss": 0.557054877281189, + "eval_rewards/accuracies": 0.8125, + "eval_rewards/chosen": -1.6915401220321655, + "eval_rewards/margins": 2.2908077239990234, + "eval_rewards/rejected": -3.9823474884033203, + "eval_runtime": 55.8179, + "eval_samples_per_second": 17.915, + "eval_steps_per_second": 0.287, + "step": 2200 + }, + { + "epoch": 1.14, + "learning_rate": 3.443296997513865e-07, + "logits/chosen": -2.8264622688293457, + "logits/rejected": -2.802203416824341, + "logps/chosen": -317.52960205078125, + "logps/rejected": -342.2868957519531, + "loss": 0.0822, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.2196967601776123, + "rewards/margins": 7.658734321594238, + "rewards/rejected": -6.439038276672363, + "step": 2210 + }, + { + "epoch": 1.15, + "learning_rate": 3.433734939759036e-07, + "logits/chosen": -2.8357937335968018, + "logits/rejected": -2.8530819416046143, + "logps/chosen": -235.8030548095703, + "logps/rejected": -277.0107116699219, + "loss": 0.1058, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.7737981677055359, + "rewards/margins": 6.4052414894104, + "rewards/rejected": -5.631443023681641, + "step": 2220 + }, + { + "epoch": 1.15, + "learning_rate": 3.4241728820042073e-07, + "logits/chosen": -2.7552199363708496, + "logits/rejected": -2.7428534030914307, + "logps/chosen": -227.3050079345703, + "logps/rejected": -270.3177185058594, + "loss": 0.121, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.6933164000511169, + "rewards/margins": 4.165007591247559, + "rewards/rejected": -4.858323574066162, + "step": 2230 + }, + { + "epoch": 1.16, + "learning_rate": 3.4146108242493784e-07, + "logits/chosen": -2.7783877849578857, + "logits/rejected": -2.8269574642181396, + "logps/chosen": -289.098388671875, + "logps/rejected": -384.05474853515625, + "loss": 0.1735, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.44400936365127563, + "rewards/margins": 7.5069899559021, + "rewards/rejected": -7.062979698181152, + "step": 2240 + }, + { + "epoch": 1.16, + "learning_rate": 3.405048766494549e-07, + "logits/chosen": -2.576467990875244, + "logits/rejected": -2.569551706314087, + "logps/chosen": -264.085205078125, + "logps/rejected": -242.80126953125, + "loss": 0.2203, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.19903890788555145, + "rewards/margins": 5.752175331115723, + "rewards/rejected": -5.553135871887207, + "step": 2250 + }, + { + "epoch": 1.17, + "learning_rate": 3.39548670873972e-07, + "logits/chosen": -2.780787944793701, + "logits/rejected": -2.7364470958709717, + "logps/chosen": -327.9325866699219, + "logps/rejected": -381.19915771484375, + "loss": 0.0916, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.037799786776304245, + "rewards/margins": 5.7476677894592285, + "rewards/rejected": -5.785468101501465, + "step": 2260 + }, + { + "epoch": 1.17, + "learning_rate": 3.3859246509848914e-07, + "logits/chosen": -2.716096878051758, + "logits/rejected": -2.7619426250457764, + "logps/chosen": -270.20281982421875, + "logps/rejected": -309.69500732421875, + "loss": 0.0833, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1129147857427597, + "rewards/margins": 5.1096320152282715, + "rewards/rejected": -5.22254753112793, + "step": 2270 + }, + { + "epoch": 1.18, + "learning_rate": 3.376362593230063e-07, + "logits/chosen": -2.633354902267456, + "logits/rejected": -2.5907938480377197, + "logps/chosen": -214.78662109375, + "logps/rejected": -303.368408203125, + "loss": 0.0682, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.21891649067401886, + "rewards/margins": 6.782713413238525, + "rewards/rejected": -6.5637969970703125, + "step": 2280 + }, + { + "epoch": 1.18, + "learning_rate": 3.366800535475234e-07, + "logits/chosen": -2.788681745529175, + "logits/rejected": -2.6701889038085938, + "logps/chosen": -367.4220886230469, + "logps/rejected": -265.5601501464844, + "loss": 0.0699, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7071707248687744, + "rewards/margins": 8.423349380493164, + "rewards/rejected": -5.716177940368652, + "step": 2290 + }, + { + "epoch": 1.19, + "learning_rate": 3.3572384777204054e-07, + "logits/chosen": -2.6153483390808105, + "logits/rejected": -2.575199842453003, + "logps/chosen": -398.6623229980469, + "logps/rejected": -343.8503112792969, + "loss": 0.1058, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5428920984268188, + "rewards/margins": 7.921414852142334, + "rewards/rejected": -7.3785223960876465, + "step": 2300 + }, + { + "epoch": 1.19, + "eval_logits/chosen": -2.5798184871673584, + "eval_logits/rejected": -2.5527260303497314, + "eval_logps/chosen": -264.8562927246094, + "eval_logps/rejected": -295.91058349609375, + "eval_loss": 0.5789377689361572, + "eval_rewards/accuracies": 0.796875, + "eval_rewards/chosen": -1.0621176958084106, + "eval_rewards/margins": 2.8319482803344727, + "eval_rewards/rejected": -3.8940658569335938, + "eval_runtime": 58.0073, + "eval_samples_per_second": 17.239, + "eval_steps_per_second": 0.276, + "step": 2300 + }, + { + "epoch": 1.19, + "learning_rate": 3.3476764199655765e-07, + "logits/chosen": -2.513836145401001, + "logits/rejected": -2.6243300437927246, + "logps/chosen": -219.6814422607422, + "logps/rejected": -218.55807495117188, + "loss": 0.1111, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.24565038084983826, + "rewards/margins": 4.5390119552612305, + "rewards/rejected": -4.293361663818359, + "step": 2310 + }, + { + "epoch": 1.2, + "learning_rate": 3.3381143622107477e-07, + "logits/chosen": -2.8270373344421387, + "logits/rejected": -2.7377943992614746, + "logps/chosen": -401.356201171875, + "logps/rejected": -353.20965576171875, + "loss": 0.058, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6988257169723511, + "rewards/margins": 6.860513210296631, + "rewards/rejected": -6.161687850952148, + "step": 2320 + }, + { + "epoch": 1.2, + "learning_rate": 3.328552304455919e-07, + "logits/chosen": -2.717745542526245, + "logits/rejected": -2.673698902130127, + "logps/chosen": -268.2499084472656, + "logps/rejected": -293.6933898925781, + "loss": 0.0778, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8076402544975281, + "rewards/margins": 7.105103969573975, + "rewards/rejected": -6.297463417053223, + "step": 2330 + }, + { + "epoch": 1.21, + "learning_rate": 3.31899024670109e-07, + "logits/chosen": -2.759124517440796, + "logits/rejected": -2.744246006011963, + "logps/chosen": -315.86248779296875, + "logps/rejected": -262.46099853515625, + "loss": 0.0721, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.9767075777053833, + "rewards/margins": 4.7864885330200195, + "rewards/rejected": -5.7631964683532715, + "step": 2340 + }, + { + "epoch": 1.21, + "learning_rate": 3.309428188946261e-07, + "logits/chosen": -2.6659247875213623, + "logits/rejected": -2.627288341522217, + "logps/chosen": -165.9207305908203, + "logps/rejected": -166.2641143798828, + "loss": 0.1052, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17963728308677673, + "rewards/margins": 4.340859413146973, + "rewards/rejected": -4.520496368408203, + "step": 2350 + }, + { + "epoch": 1.22, + "learning_rate": 3.2998661311914323e-07, + "logits/chosen": -2.6454150676727295, + "logits/rejected": -2.5655908584594727, + "logps/chosen": -195.2259979248047, + "logps/rejected": -328.01806640625, + "loss": 0.0714, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6261202096939087, + "rewards/margins": 8.00461196899414, + "rewards/rejected": -7.3784918785095215, + "step": 2360 + }, + { + "epoch": 1.22, + "learning_rate": 3.2903040734366035e-07, + "logits/chosen": -2.7958927154541016, + "logits/rejected": -2.7633419036865234, + "logps/chosen": -299.79107666015625, + "logps/rejected": -378.6677551269531, + "loss": 0.0659, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1539552211761475, + "rewards/margins": 7.467595100402832, + "rewards/rejected": -6.31364107131958, + "step": 2370 + }, + { + "epoch": 1.23, + "learning_rate": 3.2807420156817746e-07, + "logits/chosen": -2.7090749740600586, + "logits/rejected": -2.64817476272583, + "logps/chosen": -226.72525024414062, + "logps/rejected": -330.6046142578125, + "loss": 0.1011, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004512411542236805, + "rewards/margins": 5.72462272644043, + "rewards/rejected": -5.729135990142822, + "step": 2380 + }, + { + "epoch": 1.23, + "learning_rate": 3.271179957926946e-07, + "logits/chosen": -2.447643995285034, + "logits/rejected": -2.5474460124969482, + "logps/chosen": -325.9497985839844, + "logps/rejected": -339.540771484375, + "loss": 0.0895, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07166711986064911, + "rewards/margins": 5.96669864654541, + "rewards/rejected": -5.895030975341797, + "step": 2390 + }, + { + "epoch": 1.24, + "learning_rate": 3.261617900172117e-07, + "logits/chosen": -2.672534465789795, + "logits/rejected": -2.7197232246398926, + "logps/chosen": -225.26168823242188, + "logps/rejected": -283.93212890625, + "loss": 0.2423, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.4908114969730377, + "rewards/margins": 6.257862567901611, + "rewards/rejected": -5.767050743103027, + "step": 2400 + }, + { + "epoch": 1.24, + "eval_logits/chosen": -2.5784032344818115, + "eval_logits/rejected": -2.5413780212402344, + "eval_logps/chosen": -266.1980895996094, + "eval_logps/rejected": -292.55987548828125, + "eval_loss": 0.545539379119873, + "eval_rewards/accuracies": 0.78125, + "eval_rewards/chosen": -1.1962969303131104, + "eval_rewards/margins": 2.362699508666992, + "eval_rewards/rejected": -3.5589966773986816, + "eval_runtime": 57.2058, + "eval_samples_per_second": 17.481, + "eval_steps_per_second": 0.28, + "step": 2400 + }, + { + "epoch": 1.24, + "learning_rate": 3.2520558424172876e-07, + "logits/chosen": -2.705650568008423, + "logits/rejected": -2.6275668144226074, + "logps/chosen": -204.61109924316406, + "logps/rejected": -228.20059204101562, + "loss": 0.0638, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12385628372430801, + "rewards/margins": 5.060498237609863, + "rewards/rejected": -5.184354782104492, + "step": 2410 + }, + { + "epoch": 1.25, + "learning_rate": 3.242493784662459e-07, + "logits/chosen": -2.7630136013031006, + "logits/rejected": -2.7599825859069824, + "logps/chosen": -271.2209777832031, + "logps/rejected": -319.4446716308594, + "loss": 0.0914, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.31576937437057495, + "rewards/margins": 5.5094099044799805, + "rewards/rejected": -5.193641185760498, + "step": 2420 + }, + { + "epoch": 1.25, + "learning_rate": 3.2329317269076304e-07, + "logits/chosen": -2.7317633628845215, + "logits/rejected": -2.655245780944824, + "logps/chosen": -273.3720397949219, + "logps/rejected": -324.91253662109375, + "loss": 0.0998, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1897385120391846, + "rewards/margins": 7.156263828277588, + "rewards/rejected": -5.966525077819824, + "step": 2430 + }, + { + "epoch": 1.26, + "learning_rate": 3.2233696691528016e-07, + "logits/chosen": -2.7852416038513184, + "logits/rejected": -2.7786805629730225, + "logps/chosen": -261.6937255859375, + "logps/rejected": -367.239501953125, + "loss": 0.0981, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2177461385726929, + "rewards/margins": 8.014082908630371, + "rewards/rejected": -6.796337127685547, + "step": 2440 + }, + { + "epoch": 1.26, + "learning_rate": 3.2138076113979727e-07, + "logits/chosen": -2.648454427719116, + "logits/rejected": -2.6311562061309814, + "logps/chosen": -240.67056274414062, + "logps/rejected": -267.9673156738281, + "loss": 0.0978, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.28647559881210327, + "rewards/margins": 6.354050636291504, + "rewards/rejected": -6.067575454711914, + "step": 2450 + }, + { + "epoch": 1.27, + "learning_rate": 3.204245553643144e-07, + "logits/chosen": -2.7582898139953613, + "logits/rejected": -2.830904722213745, + "logps/chosen": -309.43853759765625, + "logps/rejected": -350.77618408203125, + "loss": 0.098, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6567636728286743, + "rewards/margins": 6.094487190246582, + "rewards/rejected": -5.437722682952881, + "step": 2460 + }, + { + "epoch": 1.28, + "learning_rate": 3.194683495888315e-07, + "logits/chosen": -2.784090518951416, + "logits/rejected": -2.7697300910949707, + "logps/chosen": -277.5585021972656, + "logps/rejected": -286.3648986816406, + "loss": 0.1034, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.03639407083392143, + "rewards/margins": 6.042351245880127, + "rewards/rejected": -6.078745365142822, + "step": 2470 + }, + { + "epoch": 1.28, + "learning_rate": 3.185121438133486e-07, + "logits/chosen": -2.758202075958252, + "logits/rejected": -2.7667107582092285, + "logps/chosen": -282.9512023925781, + "logps/rejected": -355.96319580078125, + "loss": 0.1263, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.692094087600708, + "rewards/margins": 7.221386909484863, + "rewards/rejected": -6.529292106628418, + "step": 2480 + }, + { + "epoch": 1.29, + "learning_rate": 3.1755593803786574e-07, + "logits/chosen": -2.6361289024353027, + "logits/rejected": -2.656646966934204, + "logps/chosen": -201.61358642578125, + "logps/rejected": -313.6552734375, + "loss": 0.0946, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.4261380434036255, + "rewards/margins": 5.699584484100342, + "rewards/rejected": -7.125722408294678, + "step": 2490 + }, + { + "epoch": 1.29, + "learning_rate": 3.1659973226238285e-07, + "logits/chosen": -2.6961874961853027, + "logits/rejected": -2.658639669418335, + "logps/chosen": -205.56558227539062, + "logps/rejected": -260.9989013671875, + "loss": 0.1177, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.1720048189163208, + "rewards/margins": 3.783812999725342, + "rewards/rejected": -4.955817222595215, + "step": 2500 + }, + { + "epoch": 1.29, + "eval_logits/chosen": -2.518930435180664, + "eval_logits/rejected": -2.480231285095215, + "eval_logps/chosen": -272.3760681152344, + "eval_logps/rejected": -300.9119567871094, + "eval_loss": 0.5888839960098267, + "eval_rewards/accuracies": 0.796875, + "eval_rewards/chosen": -1.814096450805664, + "eval_rewards/margins": 2.580104112625122, + "eval_rewards/rejected": -4.394200325012207, + "eval_runtime": 58.8794, + "eval_samples_per_second": 16.984, + "eval_steps_per_second": 0.272, + "step": 2500 + }, + { + "epoch": 1.3, + "learning_rate": 3.1564352648689997e-07, + "logits/chosen": -2.7483344078063965, + "logits/rejected": -2.7376341819763184, + "logps/chosen": -269.5032653808594, + "logps/rejected": -253.87051391601562, + "loss": 0.1018, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.7494356632232666, + "rewards/margins": 5.012850761413574, + "rewards/rejected": -4.263415336608887, + "step": 2510 + }, + { + "epoch": 1.3, + "learning_rate": 3.146873207114171e-07, + "logits/chosen": -2.7938504219055176, + "logits/rejected": -2.7508413791656494, + "logps/chosen": -274.398193359375, + "logps/rejected": -306.8814697265625, + "loss": 0.1026, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5945212244987488, + "rewards/margins": 7.071564674377441, + "rewards/rejected": -6.477043151855469, + "step": 2520 + }, + { + "epoch": 1.31, + "learning_rate": 3.137311149359342e-07, + "logits/chosen": -2.858582019805908, + "logits/rejected": -2.723261833190918, + "logps/chosen": -274.79425048828125, + "logps/rejected": -379.57501220703125, + "loss": 0.0912, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21533474326133728, + "rewards/margins": 5.66866397857666, + "rewards/rejected": -5.883998870849609, + "step": 2530 + }, + { + "epoch": 1.31, + "learning_rate": 3.127749091604513e-07, + "logits/chosen": -2.8357625007629395, + "logits/rejected": -2.814939260482788, + "logps/chosen": -232.6085662841797, + "logps/rejected": -294.39849853515625, + "loss": 0.0899, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.3493742048740387, + "rewards/margins": 6.373709678649902, + "rewards/rejected": -6.723084449768066, + "step": 2540 + }, + { + "epoch": 1.32, + "learning_rate": 3.1181870338496843e-07, + "logits/chosen": -2.679546594619751, + "logits/rejected": -2.7267134189605713, + "logps/chosen": -209.9823760986328, + "logps/rejected": -287.78826904296875, + "loss": 0.091, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.18076567351818085, + "rewards/margins": 5.487452983856201, + "rewards/rejected": -5.306687831878662, + "step": 2550 + }, + { + "epoch": 1.32, + "learning_rate": 3.108624976094856e-07, + "logits/chosen": -2.5259623527526855, + "logits/rejected": -2.6586062908172607, + "logps/chosen": -365.71258544921875, + "logps/rejected": -242.0026397705078, + "loss": 0.1047, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6775692701339722, + "rewards/margins": 5.460636615753174, + "rewards/rejected": -4.783067226409912, + "step": 2560 + }, + { + "epoch": 1.33, + "learning_rate": 3.0990629183400266e-07, + "logits/chosen": -2.5484824180603027, + "logits/rejected": -2.580888509750366, + "logps/chosen": -270.9229736328125, + "logps/rejected": -282.5039367675781, + "loss": 0.1062, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.9959784746170044, + "rewards/margins": 7.229222297668457, + "rewards/rejected": -6.233242988586426, + "step": 2570 + }, + { + "epoch": 1.33, + "learning_rate": 3.089500860585198e-07, + "logits/chosen": -2.762617588043213, + "logits/rejected": -2.739429235458374, + "logps/chosen": -200.62588500976562, + "logps/rejected": -308.99127197265625, + "loss": 0.1209, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03968176990747452, + "rewards/margins": 7.24197244644165, + "rewards/rejected": -7.2022905349731445, + "step": 2580 + }, + { + "epoch": 1.34, + "learning_rate": 3.079938802830369e-07, + "logits/chosen": -2.7819771766662598, + "logits/rejected": -2.755398750305176, + "logps/chosen": -207.9453582763672, + "logps/rejected": -235.0533447265625, + "loss": 0.086, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6877096891403198, + "rewards/margins": 5.369903564453125, + "rewards/rejected": -6.057612895965576, + "step": 2590 + }, + { + "epoch": 1.34, + "learning_rate": 3.07037674507554e-07, + "logits/chosen": -2.706509828567505, + "logits/rejected": -2.741109848022461, + "logps/chosen": -328.0285949707031, + "logps/rejected": -378.04339599609375, + "loss": 0.1213, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.15584062039852142, + "rewards/margins": 7.148020267486572, + "rewards/rejected": -6.992179870605469, + "step": 2600 + }, + { + "epoch": 1.34, + "eval_logits/chosen": -2.5206711292266846, + "eval_logits/rejected": -2.4774041175842285, + "eval_logps/chosen": -268.8435974121094, + "eval_logps/rejected": -295.39013671875, + "eval_loss": 0.5683205723762512, + "eval_rewards/accuracies": 0.8125, + "eval_rewards/chosen": -1.4608486890792847, + "eval_rewards/margins": 2.3811748027801514, + "eval_rewards/rejected": -3.8420238494873047, + "eval_runtime": 59.9277, + "eval_samples_per_second": 16.687, + "eval_steps_per_second": 0.267, + "step": 2600 + }, + { + "epoch": 1.35, + "learning_rate": 3.060814687320711e-07, + "logits/chosen": -2.6932194232940674, + "logits/rejected": -2.6989266872406006, + "logps/chosen": -347.0301818847656, + "logps/rejected": -320.0285949707031, + "loss": 0.0986, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.9022884368896484, + "rewards/margins": 8.200953483581543, + "rewards/rejected": -6.2986650466918945, + "step": 2610 + }, + { + "epoch": 1.35, + "learning_rate": 3.0512526295658824e-07, + "logits/chosen": -2.5327630043029785, + "logits/rejected": -2.500969409942627, + "logps/chosen": -234.2447509765625, + "logps/rejected": -247.3339080810547, + "loss": 0.094, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.20838475227355957, + "rewards/margins": 5.5928425788879395, + "rewards/rejected": -5.80122709274292, + "step": 2620 + }, + { + "epoch": 1.36, + "learning_rate": 3.0416905718110536e-07, + "logits/chosen": -2.600487232208252, + "logits/rejected": -2.682471990585327, + "logps/chosen": -246.78024291992188, + "logps/rejected": -246.34237670898438, + "loss": 0.1632, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.0733012929558754, + "rewards/margins": 5.1650261878967285, + "rewards/rejected": -5.0917253494262695, + "step": 2630 + }, + { + "epoch": 1.36, + "learning_rate": 3.0321285140562247e-07, + "logits/chosen": -2.7498373985290527, + "logits/rejected": -2.6727969646453857, + "logps/chosen": -197.46665954589844, + "logps/rejected": -293.0552062988281, + "loss": 0.087, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.277592420578003, + "rewards/margins": 4.636383533477783, + "rewards/rejected": -5.913976192474365, + "step": 2640 + }, + { + "epoch": 1.37, + "learning_rate": 3.022566456301396e-07, + "logits/chosen": -2.568721294403076, + "logits/rejected": -2.7006120681762695, + "logps/chosen": -296.3185119628906, + "logps/rejected": -307.19818115234375, + "loss": 0.1191, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03509577363729477, + "rewards/margins": 6.352346897125244, + "rewards/rejected": -6.387442111968994, + "step": 2650 + }, + { + "epoch": 1.37, + "learning_rate": 3.013004398546567e-07, + "logits/chosen": -2.590341091156006, + "logits/rejected": -2.6296682357788086, + "logps/chosen": -299.8781433105469, + "logps/rejected": -309.7999572753906, + "loss": 0.1044, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.5305342078208923, + "rewards/margins": 5.682303428649902, + "rewards/rejected": -6.212838172912598, + "step": 2660 + }, + { + "epoch": 1.38, + "learning_rate": 3.003442340791738e-07, + "logits/chosen": -2.69757080078125, + "logits/rejected": -2.7322795391082764, + "logps/chosen": -250.77490234375, + "logps/rejected": -287.36785888671875, + "loss": 0.0781, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.4853235185146332, + "rewards/margins": 6.51000452041626, + "rewards/rejected": -6.995328426361084, + "step": 2670 + }, + { + "epoch": 1.38, + "learning_rate": 2.9938802830369093e-07, + "logits/chosen": -2.7183175086975098, + "logits/rejected": -2.7362000942230225, + "logps/chosen": -189.50961303710938, + "logps/rejected": -314.11114501953125, + "loss": 0.1049, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.7947381734848022, + "rewards/margins": 6.518137454986572, + "rewards/rejected": -7.312876224517822, + "step": 2680 + }, + { + "epoch": 1.39, + "learning_rate": 2.9843182252820805e-07, + "logits/chosen": -2.5106515884399414, + "logits/rejected": -2.5575432777404785, + "logps/chosen": -249.78012084960938, + "logps/rejected": -237.69677734375, + "loss": 0.0708, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.6967580914497375, + "rewards/margins": 5.879612922668457, + "rewards/rejected": -6.576371192932129, + "step": 2690 + }, + { + "epoch": 1.39, + "learning_rate": 2.974756167527252e-07, + "logits/chosen": -2.7255759239196777, + "logits/rejected": -2.62638521194458, + "logps/chosen": -315.8297424316406, + "logps/rejected": -245.0868682861328, + "loss": 0.0889, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14952102303504944, + "rewards/margins": 5.201340675354004, + "rewards/rejected": -5.3508620262146, + "step": 2700 + }, + { + "epoch": 1.39, + "eval_logits/chosen": -2.452204704284668, + "eval_logits/rejected": -2.4123356342315674, + "eval_logps/chosen": -270.2423400878906, + "eval_logps/rejected": -294.30682373046875, + "eval_loss": 0.5890262126922607, + "eval_rewards/accuracies": 0.78125, + "eval_rewards/chosen": -1.6007238626480103, + "eval_rewards/margins": 2.132964849472046, + "eval_rewards/rejected": -3.7336881160736084, + "eval_runtime": 60.2724, + "eval_samples_per_second": 16.591, + "eval_steps_per_second": 0.265, + "step": 2700 + }, + { + "epoch": 1.4, + "learning_rate": 2.9651941097724233e-07, + "logits/chosen": -2.6822290420532227, + "logits/rejected": -2.7052135467529297, + "logps/chosen": -278.01422119140625, + "logps/rejected": -339.3485412597656, + "loss": 0.1087, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15647803246974945, + "rewards/margins": 5.597909450531006, + "rewards/rejected": -5.754388332366943, + "step": 2710 + }, + { + "epoch": 1.4, + "learning_rate": 2.9556320520175945e-07, + "logits/chosen": -2.5796897411346436, + "logits/rejected": -2.6120152473449707, + "logps/chosen": -276.00408935546875, + "logps/rejected": -235.9673614501953, + "loss": 0.1325, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.0651065111160278, + "rewards/margins": 4.378058433532715, + "rewards/rejected": -5.443163871765137, + "step": 2720 + }, + { + "epoch": 1.41, + "learning_rate": 2.946069994262765e-07, + "logits/chosen": -2.682631015777588, + "logits/rejected": -2.5832624435424805, + "logps/chosen": -325.86883544921875, + "logps/rejected": -448.3667907714844, + "loss": 0.1033, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8688300848007202, + "rewards/margins": 9.84605598449707, + "rewards/rejected": -7.977224826812744, + "step": 2730 + }, + { + "epoch": 1.41, + "learning_rate": 2.9365079365079363e-07, + "logits/chosen": -2.656704902648926, + "logits/rejected": -2.6298129558563232, + "logps/chosen": -312.0522766113281, + "logps/rejected": -264.58172607421875, + "loss": 0.0869, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.325814425945282, + "rewards/margins": 7.053065299987793, + "rewards/rejected": -6.727250099182129, + "step": 2740 + }, + { + "epoch": 1.42, + "learning_rate": 2.9269458787531074e-07, + "logits/chosen": -2.7410786151885986, + "logits/rejected": -2.60438871383667, + "logps/chosen": -338.72125244140625, + "logps/rejected": -300.7152099609375, + "loss": 0.099, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.48889246582984924, + "rewards/margins": 5.851205348968506, + "rewards/rejected": -5.3623127937316895, + "step": 2750 + }, + { + "epoch": 1.42, + "learning_rate": 2.9173838209982786e-07, + "logits/chosen": -2.5442605018615723, + "logits/rejected": -2.396920680999756, + "logps/chosen": -212.8131561279297, + "logps/rejected": -245.8048858642578, + "loss": 0.0924, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18661096692085266, + "rewards/margins": 5.660151481628418, + "rewards/rejected": -5.846762657165527, + "step": 2760 + }, + { + "epoch": 1.43, + "learning_rate": 2.90782176324345e-07, + "logits/chosen": -2.770378589630127, + "logits/rejected": -2.652078628540039, + "logps/chosen": -284.2225036621094, + "logps/rejected": -237.8290557861328, + "loss": 0.0933, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5430759787559509, + "rewards/margins": 5.237969398498535, + "rewards/rejected": -5.781044960021973, + "step": 2770 + }, + { + "epoch": 1.44, + "learning_rate": 2.898259705488621e-07, + "logits/chosen": -2.763434410095215, + "logits/rejected": -2.824732542037964, + "logps/chosen": -295.1107482910156, + "logps/rejected": -265.9899597167969, + "loss": 0.0896, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.034651029855012894, + "rewards/margins": 6.801316261291504, + "rewards/rejected": -6.835967063903809, + "step": 2780 + }, + { + "epoch": 1.44, + "learning_rate": 2.888697647733792e-07, + "logits/chosen": -2.626591920852661, + "logits/rejected": -2.693389892578125, + "logps/chosen": -382.92559814453125, + "logps/rejected": -343.2526550292969, + "loss": 0.1328, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3407518267631531, + "rewards/margins": 8.370372772216797, + "rewards/rejected": -8.029620170593262, + "step": 2790 + }, + { + "epoch": 1.45, + "learning_rate": 2.879135589978963e-07, + "logits/chosen": -2.7098212242126465, + "logits/rejected": -2.6440200805664062, + "logps/chosen": -237.72042846679688, + "logps/rejected": -263.23858642578125, + "loss": 0.0995, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1682741343975067, + "rewards/margins": 6.354408264160156, + "rewards/rejected": -6.186134338378906, + "step": 2800 + }, + { + "epoch": 1.45, + "eval_logits/chosen": -2.505031108856201, + "eval_logits/rejected": -2.4685418605804443, + "eval_logps/chosen": -269.7538146972656, + "eval_logps/rejected": -295.3314514160156, + "eval_loss": 0.6072700023651123, + "eval_rewards/accuracies": 0.828125, + "eval_rewards/chosen": -1.5518717765808105, + "eval_rewards/margins": 2.2842793464660645, + "eval_rewards/rejected": -3.836151123046875, + "eval_runtime": 55.9165, + "eval_samples_per_second": 17.884, + "eval_steps_per_second": 0.286, + "step": 2800 + }, + { + "epoch": 1.45, + "learning_rate": 2.8695735322241344e-07, + "logits/chosen": -2.6192431449890137, + "logits/rejected": -2.5992114543914795, + "logps/chosen": -243.22531127929688, + "logps/rejected": -259.7867431640625, + "loss": 0.1528, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.13970229029655457, + "rewards/margins": 6.5014328956604, + "rewards/rejected": -6.641134738922119, + "step": 2810 + }, + { + "epoch": 1.46, + "learning_rate": 2.8600114744693055e-07, + "logits/chosen": -2.5777206420898438, + "logits/rejected": -2.595568895339966, + "logps/chosen": -315.70513916015625, + "logps/rejected": -328.4643859863281, + "loss": 0.0978, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.2178268432617188, + "rewards/margins": 7.306548118591309, + "rewards/rejected": -6.08872127532959, + "step": 2820 + }, + { + "epoch": 1.46, + "learning_rate": 2.8504494167144767e-07, + "logits/chosen": -2.5762603282928467, + "logits/rejected": -2.6186347007751465, + "logps/chosen": -338.2220153808594, + "logps/rejected": -372.440673828125, + "loss": 0.1064, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.40805625915527344, + "rewards/margins": 8.335431098937988, + "rewards/rejected": -7.927374362945557, + "step": 2830 + }, + { + "epoch": 1.47, + "learning_rate": 2.8408873589596484e-07, + "logits/chosen": -2.6718220710754395, + "logits/rejected": -2.6635639667510986, + "logps/chosen": -187.11207580566406, + "logps/rejected": -261.17236328125, + "loss": 0.0828, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.6262374520301819, + "rewards/margins": 5.5064697265625, + "rewards/rejected": -6.132707595825195, + "step": 2840 + }, + { + "epoch": 1.47, + "learning_rate": 2.8313253012048195e-07, + "logits/chosen": -2.549769163131714, + "logits/rejected": -2.656653881072998, + "logps/chosen": -244.2962188720703, + "logps/rejected": -360.3494567871094, + "loss": 0.1782, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.2172654867172241, + "rewards/margins": 5.140130043029785, + "rewards/rejected": -6.357396125793457, + "step": 2850 + }, + { + "epoch": 1.48, + "learning_rate": 2.8217632434499907e-07, + "logits/chosen": -2.645021915435791, + "logits/rejected": -2.6408421993255615, + "logps/chosen": -283.7989807128906, + "logps/rejected": -308.847900390625, + "loss": 0.1242, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1975909173488617, + "rewards/margins": 6.074164867401123, + "rewards/rejected": -6.271755695343018, + "step": 2860 + }, + { + "epoch": 1.48, + "learning_rate": 2.812201185695162e-07, + "logits/chosen": -2.6722216606140137, + "logits/rejected": -2.614084482192993, + "logps/chosen": -170.52699279785156, + "logps/rejected": -282.1553039550781, + "loss": 0.0736, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.22432120144367218, + "rewards/margins": 6.5738677978515625, + "rewards/rejected": -6.349545955657959, + "step": 2870 + }, + { + "epoch": 1.49, + "learning_rate": 2.802639127940333e-07, + "logits/chosen": -2.706528425216675, + "logits/rejected": -2.6337788105010986, + "logps/chosen": -313.04913330078125, + "logps/rejected": -283.40972900390625, + "loss": 0.0995, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.12990444898605347, + "rewards/margins": 6.201463222503662, + "rewards/rejected": -6.071558475494385, + "step": 2880 + }, + { + "epoch": 1.49, + "learning_rate": 2.7930770701855036e-07, + "logits/chosen": -2.7476582527160645, + "logits/rejected": -2.683351755142212, + "logps/chosen": -351.7320251464844, + "logps/rejected": -329.08026123046875, + "loss": 0.1173, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.49377602338790894, + "rewards/margins": 8.726078987121582, + "rewards/rejected": -8.23230266571045, + "step": 2890 + }, + { + "epoch": 1.5, + "learning_rate": 2.783515012430675e-07, + "logits/chosen": -2.5259041786193848, + "logits/rejected": -2.5326766967773438, + "logps/chosen": -190.27813720703125, + "logps/rejected": -241.18991088867188, + "loss": 0.1145, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.45352381467819214, + "rewards/margins": 5.041954517364502, + "rewards/rejected": -5.495478630065918, + "step": 2900 + }, + { + "epoch": 1.5, + "eval_logits/chosen": -2.4674015045166016, + "eval_logits/rejected": -2.42722749710083, + "eval_logps/chosen": -272.1744384765625, + "eval_logps/rejected": -299.8460998535156, + "eval_loss": 0.579024076461792, + "eval_rewards/accuracies": 0.84375, + "eval_rewards/chosen": -1.7939329147338867, + "eval_rewards/margins": 2.4936835765838623, + "eval_rewards/rejected": -4.28761625289917, + "eval_runtime": 57.5798, + "eval_samples_per_second": 17.367, + "eval_steps_per_second": 0.278, + "step": 2900 + }, + { + "epoch": 1.5, + "learning_rate": 2.773952954675846e-07, + "logits/chosen": -2.6517531871795654, + "logits/rejected": -2.611769914627075, + "logps/chosen": -357.39666748046875, + "logps/rejected": -310.7156677246094, + "loss": 0.086, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.0798923969268799, + "rewards/margins": 8.055770874023438, + "rewards/rejected": -6.9758782386779785, + "step": 2910 + }, + { + "epoch": 1.51, + "learning_rate": 2.764390896921017e-07, + "logits/chosen": -2.603874921798706, + "logits/rejected": -2.5526695251464844, + "logps/chosen": -272.8443298339844, + "logps/rejected": -329.92401123046875, + "loss": 0.0943, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.03761887550354, + "rewards/margins": 8.095129013061523, + "rewards/rejected": -7.057511329650879, + "step": 2920 + }, + { + "epoch": 1.51, + "learning_rate": 2.754828839166188e-07, + "logits/chosen": -2.6576075553894043, + "logits/rejected": -2.6514670848846436, + "logps/chosen": -191.14877319335938, + "logps/rejected": -301.5423889160156, + "loss": 0.0724, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.5387217402458191, + "rewards/margins": 7.507475852966309, + "rewards/rejected": -6.968753814697266, + "step": 2930 + }, + { + "epoch": 1.52, + "learning_rate": 2.7452667814113594e-07, + "logits/chosen": -2.7524516582489014, + "logits/rejected": -2.7706661224365234, + "logps/chosen": -318.32501220703125, + "logps/rejected": -316.9727783203125, + "loss": 0.087, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.01138849277049303, + "rewards/margins": 5.306549072265625, + "rewards/rejected": -5.295160293579102, + "step": 2940 + }, + { + "epoch": 1.52, + "learning_rate": 2.7357047236565306e-07, + "logits/chosen": -2.699721097946167, + "logits/rejected": -2.7687458992004395, + "logps/chosen": -280.2265319824219, + "logps/rejected": -364.947509765625, + "loss": 0.0659, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.4582802355289459, + "rewards/margins": 6.130402088165283, + "rewards/rejected": -5.672121047973633, + "step": 2950 + }, + { + "epoch": 1.53, + "learning_rate": 2.7261426659017017e-07, + "logits/chosen": -2.7413382530212402, + "logits/rejected": -2.7660346031188965, + "logps/chosen": -324.0436706542969, + "logps/rejected": -243.81802368164062, + "loss": 0.0914, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.6909480094909668, + "rewards/margins": 4.682136535644531, + "rewards/rejected": -5.373085021972656, + "step": 2960 + }, + { + "epoch": 1.53, + "learning_rate": 2.716580608146873e-07, + "logits/chosen": -2.646226644515991, + "logits/rejected": -2.712257146835327, + "logps/chosen": -374.819580078125, + "logps/rejected": -351.70025634765625, + "loss": 0.0677, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3280370235443115, + "rewards/margins": 9.090957641601562, + "rewards/rejected": -7.762920379638672, + "step": 2970 + }, + { + "epoch": 1.54, + "learning_rate": 2.7070185503920446e-07, + "logits/chosen": -2.639868974685669, + "logits/rejected": -2.7557711601257324, + "logps/chosen": -273.087646484375, + "logps/rejected": -317.41766357421875, + "loss": 0.0616, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6701494455337524, + "rewards/margins": 7.305191993713379, + "rewards/rejected": -7.975341796875, + "step": 2980 + }, + { + "epoch": 1.54, + "learning_rate": 2.6974564926372157e-07, + "logits/chosen": -2.759113311767578, + "logits/rejected": -2.7275261878967285, + "logps/chosen": -301.2916259765625, + "logps/rejected": -324.0910949707031, + "loss": 0.0916, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.8304478526115417, + "rewards/margins": 7.585775852203369, + "rewards/rejected": -6.755328178405762, + "step": 2990 + }, + { + "epoch": 1.55, + "learning_rate": 2.687894434882387e-07, + "logits/chosen": -2.653514862060547, + "logits/rejected": -2.606902599334717, + "logps/chosen": -277.50506591796875, + "logps/rejected": -264.87481689453125, + "loss": 0.0644, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18126115202903748, + "rewards/margins": 4.916709899902344, + "rewards/rejected": -5.097971439361572, + "step": 3000 + }, + { + "epoch": 1.55, + "eval_logits/chosen": -2.457401990890503, + "eval_logits/rejected": -2.4193201065063477, + "eval_logps/chosen": -271.5200500488281, + "eval_logps/rejected": -299.0209045410156, + "eval_loss": 0.5735086798667908, + "eval_rewards/accuracies": 0.8125, + "eval_rewards/chosen": -1.7284938097000122, + "eval_rewards/margins": 2.476605176925659, + "eval_rewards/rejected": -4.205099105834961, + "eval_runtime": 58.4864, + "eval_samples_per_second": 17.098, + "eval_steps_per_second": 0.274, + "step": 3000 + }, + { + "epoch": 1.55, + "learning_rate": 2.678332377127558e-07, + "logits/chosen": -2.693279981613159, + "logits/rejected": -2.6434133052825928, + "logps/chosen": -257.072509765625, + "logps/rejected": -268.16107177734375, + "loss": 0.0794, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.660453736782074, + "rewards/margins": 7.455300807952881, + "rewards/rejected": -6.7948479652404785, + "step": 3010 + }, + { + "epoch": 1.56, + "learning_rate": 2.668770319372729e-07, + "logits/chosen": -2.476691484451294, + "logits/rejected": -2.369554042816162, + "logps/chosen": -243.014892578125, + "logps/rejected": -179.72573852539062, + "loss": 0.0769, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.28263527154922485, + "rewards/margins": 4.549951076507568, + "rewards/rejected": -4.832587242126465, + "step": 3020 + }, + { + "epoch": 1.56, + "learning_rate": 2.6592082616179004e-07, + "logits/chosen": -2.799598217010498, + "logits/rejected": -2.6863772869110107, + "logps/chosen": -202.5391082763672, + "logps/rejected": -217.33743286132812, + "loss": 0.0956, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2968635559082031, + "rewards/margins": 5.342751979827881, + "rewards/rejected": -5.639615535736084, + "step": 3030 + }, + { + "epoch": 1.57, + "learning_rate": 2.649646203863071e-07, + "logits/chosen": -2.78475022315979, + "logits/rejected": -2.7314937114715576, + "logps/chosen": -341.8492126464844, + "logps/rejected": -377.26318359375, + "loss": 0.0786, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.6281161308288574, + "rewards/margins": 6.323546409606934, + "rewards/rejected": -5.695430278778076, + "step": 3040 + }, + { + "epoch": 1.57, + "learning_rate": 2.640084146108242e-07, + "logits/chosen": -2.835313558578491, + "logits/rejected": -2.8478219509124756, + "logps/chosen": -413.6333923339844, + "logps/rejected": -281.8817138671875, + "loss": 0.0753, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.3795287013053894, + "rewards/margins": 6.382612705230713, + "rewards/rejected": -6.003084659576416, + "step": 3050 + }, + { + "epoch": 1.58, + "learning_rate": 2.6305220883534133e-07, + "logits/chosen": -2.7521536350250244, + "logits/rejected": -2.802274227142334, + "logps/chosen": -283.74365234375, + "logps/rejected": -274.3045654296875, + "loss": 0.0746, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.19368138909339905, + "rewards/margins": 5.350792407989502, + "rewards/rejected": -5.157111167907715, + "step": 3060 + }, + { + "epoch": 1.58, + "learning_rate": 2.6209600305985845e-07, + "logits/chosen": -2.6364054679870605, + "logits/rejected": -2.6233391761779785, + "logps/chosen": -320.7242736816406, + "logps/rejected": -418.53863525390625, + "loss": 0.1234, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.2693870961666107, + "rewards/margins": 10.055575370788574, + "rewards/rejected": -9.786188125610352, + "step": 3070 + }, + { + "epoch": 1.59, + "learning_rate": 2.6113979728437556e-07, + "logits/chosen": -2.757228374481201, + "logits/rejected": -2.746696949005127, + "logps/chosen": -278.10736083984375, + "logps/rejected": -269.76751708984375, + "loss": 0.1059, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18973210453987122, + "rewards/margins": 5.467267036437988, + "rewards/rejected": -5.2775349617004395, + "step": 3080 + }, + { + "epoch": 1.6, + "learning_rate": 2.601835915088927e-07, + "logits/chosen": -2.6811797618865967, + "logits/rejected": -2.694938898086548, + "logps/chosen": -268.3731689453125, + "logps/rejected": -316.51507568359375, + "loss": 0.0925, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0646179914474487, + "rewards/margins": 7.384450435638428, + "rewards/rejected": -6.319832801818848, + "step": 3090 + }, + { + "epoch": 1.6, + "learning_rate": 2.592273857334098e-07, + "logits/chosen": -2.8825931549072266, + "logits/rejected": -2.838369607925415, + "logps/chosen": -272.46466064453125, + "logps/rejected": -302.24951171875, + "loss": 0.0798, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7096697092056274, + "rewards/margins": 8.205864906311035, + "rewards/rejected": -6.496194362640381, + "step": 3100 + }, + { + "epoch": 1.6, + "eval_logits/chosen": -2.569591760635376, + "eval_logits/rejected": -2.536669969558716, + "eval_logps/chosen": -271.4610290527344, + "eval_logps/rejected": -299.8199768066406, + "eval_loss": 0.5536529421806335, + "eval_rewards/accuracies": 0.84375, + "eval_rewards/chosen": -1.7225927114486694, + "eval_rewards/margins": 2.5624139308929443, + "eval_rewards/rejected": -4.285006523132324, + "eval_runtime": 56.0043, + "eval_samples_per_second": 17.856, + "eval_steps_per_second": 0.286, + "step": 3100 + }, + { + "epoch": 1.61, + "learning_rate": 2.582711799579269e-07, + "logits/chosen": -2.8248748779296875, + "logits/rejected": -2.778346300125122, + "logps/chosen": -299.2210693359375, + "logps/rejected": -313.359375, + "loss": 0.071, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.7604249715805054, + "rewards/margins": 7.902795314788818, + "rewards/rejected": -7.142370700836182, + "step": 3110 + }, + { + "epoch": 1.61, + "learning_rate": 2.573149741824441e-07, + "logits/chosen": -2.6978442668914795, + "logits/rejected": -2.6833174228668213, + "logps/chosen": -336.5847473144531, + "logps/rejected": -245.4396209716797, + "loss": 0.0829, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.08581381291151047, + "rewards/margins": 5.277584075927734, + "rewards/rejected": -5.191770076751709, + "step": 3120 + }, + { + "epoch": 1.62, + "learning_rate": 2.563587684069612e-07, + "logits/chosen": -2.8688364028930664, + "logits/rejected": -2.7075347900390625, + "logps/chosen": -350.5228576660156, + "logps/rejected": -273.2851867675781, + "loss": 0.0564, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7785223126411438, + "rewards/margins": 6.459234714508057, + "rewards/rejected": -7.237756252288818, + "step": 3130 + }, + { + "epoch": 1.62, + "learning_rate": 2.554025626314783e-07, + "logits/chosen": -2.727123737335205, + "logits/rejected": -2.725803852081299, + "logps/chosen": -344.9717712402344, + "logps/rejected": -390.8403625488281, + "loss": 0.0898, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9268060922622681, + "rewards/margins": 6.525388240814209, + "rewards/rejected": -7.4521942138671875, + "step": 3140 + }, + { + "epoch": 1.63, + "learning_rate": 2.544463568559954e-07, + "logits/chosen": -2.786041259765625, + "logits/rejected": -2.7138454914093018, + "logps/chosen": -323.46832275390625, + "logps/rejected": -323.85125732421875, + "loss": 0.0678, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7023747563362122, + "rewards/margins": 8.29463005065918, + "rewards/rejected": -7.592255592346191, + "step": 3150 + }, + { + "epoch": 1.63, + "learning_rate": 2.5349015108051254e-07, + "logits/chosen": -2.623403787612915, + "logits/rejected": -2.6345882415771484, + "logps/chosen": -251.9879913330078, + "logps/rejected": -317.69769287109375, + "loss": 0.0833, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3217948377132416, + "rewards/margins": 7.894297122955322, + "rewards/rejected": -7.572502136230469, + "step": 3160 + }, + { + "epoch": 1.64, + "learning_rate": 2.5253394530502966e-07, + "logits/chosen": -2.483633518218994, + "logits/rejected": -2.636124610900879, + "logps/chosen": -375.053955078125, + "logps/rejected": -289.378662109375, + "loss": 0.0582, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.537722647190094, + "rewards/margins": 7.012340545654297, + "rewards/rejected": -6.474618434906006, + "step": 3170 + }, + { + "epoch": 1.64, + "learning_rate": 2.5157773952954677e-07, + "logits/chosen": -2.746309757232666, + "logits/rejected": -2.692573070526123, + "logps/chosen": -342.6100158691406, + "logps/rejected": -319.15850830078125, + "loss": 0.1194, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.26776519417762756, + "rewards/margins": 7.538763523101807, + "rewards/rejected": -7.270998477935791, + "step": 3180 + }, + { + "epoch": 1.65, + "learning_rate": 2.506215337540639e-07, + "logits/chosen": -2.842471122741699, + "logits/rejected": -2.7346935272216797, + "logps/chosen": -357.83837890625, + "logps/rejected": -415.0469665527344, + "loss": 0.1034, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20541605353355408, + "rewards/margins": 6.793099880218506, + "rewards/rejected": -6.587684631347656, + "step": 3190 + }, + { + "epoch": 1.65, + "learning_rate": 2.4966532797858095e-07, + "logits/chosen": -2.6495633125305176, + "logits/rejected": -2.666757106781006, + "logps/chosen": -266.09454345703125, + "logps/rejected": -267.7814636230469, + "loss": 0.1013, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.0470559298992157, + "rewards/margins": 6.690218925476074, + "rewards/rejected": -6.73727560043335, + "step": 3200 + }, + { + "epoch": 1.65, + "eval_logits/chosen": -2.526689291000366, + "eval_logits/rejected": -2.492635726928711, + "eval_logps/chosen": -269.9497985839844, + "eval_logps/rejected": -296.7825012207031, + "eval_loss": 0.5574991703033447, + "eval_rewards/accuracies": 0.875, + "eval_rewards/chosen": -1.5714715719223022, + "eval_rewards/margins": 2.4097867012023926, + "eval_rewards/rejected": -3.9812583923339844, + "eval_runtime": 57.7657, + "eval_samples_per_second": 17.311, + "eval_steps_per_second": 0.277, + "step": 3200 + }, + { + "epoch": 1.66, + "learning_rate": 2.4870912220309807e-07, + "logits/chosen": -2.625276565551758, + "logits/rejected": -2.73038649559021, + "logps/chosen": -289.6630554199219, + "logps/rejected": -282.751953125, + "loss": 0.0784, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.39164501428604126, + "rewards/margins": 5.46439266204834, + "rewards/rejected": -5.072747707366943, + "step": 3210 + }, + { + "epoch": 1.66, + "learning_rate": 2.477529164276152e-07, + "logits/chosen": -2.7541117668151855, + "logits/rejected": -2.75673508644104, + "logps/chosen": -309.00799560546875, + "logps/rejected": -350.14556884765625, + "loss": 0.0784, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0380117893218994, + "rewards/margins": 7.454461574554443, + "rewards/rejected": -6.416450500488281, + "step": 3220 + }, + { + "epoch": 1.67, + "learning_rate": 2.4679671065213235e-07, + "logits/chosen": -2.7203588485717773, + "logits/rejected": -2.704502582550049, + "logps/chosen": -347.71453857421875, + "logps/rejected": -295.966552734375, + "loss": 0.1137, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.29694637656211853, + "rewards/margins": 5.83956241607666, + "rewards/rejected": -5.5426154136657715, + "step": 3230 + }, + { + "epoch": 1.67, + "learning_rate": 2.4584050487664947e-07, + "logits/chosen": -2.7662394046783447, + "logits/rejected": -2.7026658058166504, + "logps/chosen": -346.2272644042969, + "logps/rejected": -320.8843078613281, + "loss": 0.0958, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.34752047061920166, + "rewards/margins": 6.67000675201416, + "rewards/rejected": -6.32248592376709, + "step": 3240 + }, + { + "epoch": 1.68, + "learning_rate": 2.448842991011666e-07, + "logits/chosen": -2.6715903282165527, + "logits/rejected": -2.603444814682007, + "logps/chosen": -275.488037109375, + "logps/rejected": -369.26861572265625, + "loss": 0.1227, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.22678561508655548, + "rewards/margins": 7.3243255615234375, + "rewards/rejected": -7.09753942489624, + "step": 3250 + }, + { + "epoch": 1.68, + "learning_rate": 2.439280933256837e-07, + "logits/chosen": -2.5521583557128906, + "logits/rejected": -2.575525999069214, + "logps/chosen": -234.0755157470703, + "logps/rejected": -253.0180206298828, + "loss": 0.0473, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.34158411622047424, + "rewards/margins": 7.31561803817749, + "rewards/rejected": -6.974034309387207, + "step": 3260 + }, + { + "epoch": 1.69, + "learning_rate": 2.429718875502008e-07, + "logits/chosen": -2.246537923812866, + "logits/rejected": -2.325873613357544, + "logps/chosen": -278.77386474609375, + "logps/rejected": -295.7586975097656, + "loss": 0.0705, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.1356315165758133, + "rewards/margins": 5.905457496643066, + "rewards/rejected": -5.7698259353637695, + "step": 3270 + }, + { + "epoch": 1.69, + "learning_rate": 2.420156817747179e-07, + "logits/chosen": -2.7891759872436523, + "logits/rejected": -2.7579565048217773, + "logps/chosen": -356.2643127441406, + "logps/rejected": -370.2890319824219, + "loss": 0.0637, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.439815878868103, + "rewards/margins": 8.942848205566406, + "rewards/rejected": -7.503033638000488, + "step": 3280 + }, + { + "epoch": 1.7, + "learning_rate": 2.41059475999235e-07, + "logits/chosen": -2.726214647293091, + "logits/rejected": -2.5874438285827637, + "logps/chosen": -226.1343231201172, + "logps/rejected": -244.17489624023438, + "loss": 0.0722, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.5386222004890442, + "rewards/margins": 6.369637489318848, + "rewards/rejected": -6.908260345458984, + "step": 3290 + }, + { + "epoch": 1.7, + "learning_rate": 2.4010327022375216e-07, + "logits/chosen": -2.660001516342163, + "logits/rejected": -2.6236727237701416, + "logps/chosen": -350.3585510253906, + "logps/rejected": -423.14605712890625, + "loss": 0.1254, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3113314211368561, + "rewards/margins": 8.888386726379395, + "rewards/rejected": -8.577055931091309, + "step": 3300 + }, + { + "epoch": 1.7, + "eval_logits/chosen": -2.533999443054199, + "eval_logits/rejected": -2.5016584396362305, + "eval_logps/chosen": -270.6473083496094, + "eval_logps/rejected": -301.6729736328125, + "eval_loss": 0.5904735326766968, + "eval_rewards/accuracies": 0.859375, + "eval_rewards/chosen": -1.6412229537963867, + "eval_rewards/margins": 2.8290822505950928, + "eval_rewards/rejected": -4.4703049659729, + "eval_runtime": 56.7796, + "eval_samples_per_second": 17.612, + "eval_steps_per_second": 0.282, + "step": 3300 + }, + { + "epoch": 1.71, + "learning_rate": 2.391470644482693e-07, + "logits/chosen": -2.5988898277282715, + "logits/rejected": -2.633589267730713, + "logps/chosen": -299.37860107421875, + "logps/rejected": -375.39788818359375, + "loss": 0.0757, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.23076781630516052, + "rewards/margins": 7.872524261474609, + "rewards/rejected": -7.641757011413574, + "step": 3310 + }, + { + "epoch": 1.71, + "learning_rate": 2.3819085867278636e-07, + "logits/chosen": -2.6122288703918457, + "logits/rejected": -2.6097447872161865, + "logps/chosen": -184.86968994140625, + "logps/rejected": -299.4801330566406, + "loss": 0.0826, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.3567231297492981, + "rewards/margins": 8.226961135864258, + "rewards/rejected": -8.583684921264648, + "step": 3320 + }, + { + "epoch": 1.72, + "learning_rate": 2.3723465289730348e-07, + "logits/chosen": -2.629166603088379, + "logits/rejected": -2.598412036895752, + "logps/chosen": -287.3348693847656, + "logps/rejected": -273.7483825683594, + "loss": 0.1158, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.2611514925956726, + "rewards/margins": 5.725651264190674, + "rewards/rejected": -5.986802577972412, + "step": 3330 + }, + { + "epoch": 1.72, + "learning_rate": 2.362784471218206e-07, + "logits/chosen": -2.73244571685791, + "logits/rejected": -2.7296879291534424, + "logps/chosen": -355.73236083984375, + "logps/rejected": -323.4547424316406, + "loss": 0.0704, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.1553064584732056, + "rewards/margins": 6.715522766113281, + "rewards/rejected": -5.560215950012207, + "step": 3340 + }, + { + "epoch": 1.73, + "learning_rate": 2.353222413463377e-07, + "logits/chosen": -2.7055535316467285, + "logits/rejected": -2.659834146499634, + "logps/chosen": -234.71792602539062, + "logps/rejected": -258.9352722167969, + "loss": 0.0594, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.04846489429473877, + "rewards/margins": 5.886017799377441, + "rewards/rejected": -5.934482574462891, + "step": 3350 + }, + { + "epoch": 1.73, + "learning_rate": 2.3436603557085483e-07, + "logits/chosen": -2.7515339851379395, + "logits/rejected": -2.647671937942505, + "logps/chosen": -205.7976837158203, + "logps/rejected": -278.0999450683594, + "loss": 0.0886, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.5387641787528992, + "rewards/margins": 6.382545471191406, + "rewards/rejected": -5.8437819480896, + "step": 3360 + }, + { + "epoch": 1.74, + "learning_rate": 2.3340982979537197e-07, + "logits/chosen": -2.727328300476074, + "logits/rejected": -2.7680537700653076, + "logps/chosen": -395.30169677734375, + "logps/rejected": -329.90234375, + "loss": 0.0734, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9938201904296875, + "rewards/margins": 6.250397682189941, + "rewards/rejected": -5.256577491760254, + "step": 3370 + }, + { + "epoch": 1.74, + "learning_rate": 2.3245362401988909e-07, + "logits/chosen": -2.7096972465515137, + "logits/rejected": -2.605597734451294, + "logps/chosen": -308.7266845703125, + "logps/rejected": -269.8343811035156, + "loss": 0.0906, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.0761408805847168, + "rewards/margins": 7.6199774742126465, + "rewards/rejected": -6.543837547302246, + "step": 3380 + }, + { + "epoch": 1.75, + "learning_rate": 2.314974182444062e-07, + "logits/chosen": -2.7419021129608154, + "logits/rejected": -2.797194004058838, + "logps/chosen": -308.60302734375, + "logps/rejected": -293.39581298828125, + "loss": 0.1425, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.2203086614608765, + "rewards/margins": 5.09138822555542, + "rewards/rejected": -6.311697483062744, + "step": 3390 + }, + { + "epoch": 1.76, + "learning_rate": 2.305412124689233e-07, + "logits/chosen": -2.521355628967285, + "logits/rejected": -2.5652852058410645, + "logps/chosen": -271.5760192871094, + "logps/rejected": -332.80743408203125, + "loss": 0.085, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12037495523691177, + "rewards/margins": 6.457161903381348, + "rewards/rejected": -6.33678674697876, + "step": 3400 + }, + { + "epoch": 1.76, + "eval_logits/chosen": -2.495957612991333, + "eval_logits/rejected": -2.461439609527588, + "eval_logps/chosen": -273.39410400390625, + "eval_logps/rejected": -303.7296142578125, + "eval_loss": 0.6132888793945312, + "eval_rewards/accuracies": 0.84375, + "eval_rewards/chosen": -1.9158999919891357, + "eval_rewards/margins": 2.760065793991089, + "eval_rewards/rejected": -4.675965309143066, + "eval_runtime": 57.5942, + "eval_samples_per_second": 17.363, + "eval_steps_per_second": 0.278, + "step": 3400 + }, + { + "epoch": 1.76, + "learning_rate": 2.295850066934404e-07, + "logits/chosen": -2.6166062355041504, + "logits/rejected": -2.540011167526245, + "logps/chosen": -279.5812683105469, + "logps/rejected": -327.165283203125, + "loss": 0.0789, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.3240896463394165, + "rewards/margins": 6.653228759765625, + "rewards/rejected": -7.977317810058594, + "step": 3410 + }, + { + "epoch": 1.77, + "learning_rate": 2.2862880091795752e-07, + "logits/chosen": -2.5269017219543457, + "logits/rejected": -2.516174077987671, + "logps/chosen": -194.35435485839844, + "logps/rejected": -307.7319030761719, + "loss": 0.0863, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.5239468812942505, + "rewards/margins": 5.74463415145874, + "rewards/rejected": -6.268580436706543, + "step": 3420 + }, + { + "epoch": 1.77, + "learning_rate": 2.2767259514247464e-07, + "logits/chosen": -2.3292429447174072, + "logits/rejected": -2.2449238300323486, + "logps/chosen": -235.97329711914062, + "logps/rejected": -297.90130615234375, + "loss": 0.0867, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7022100687026978, + "rewards/margins": 6.978515625, + "rewards/rejected": -6.27630615234375, + "step": 3430 + }, + { + "epoch": 1.78, + "learning_rate": 2.2671638936699178e-07, + "logits/chosen": -2.413973093032837, + "logits/rejected": -2.389719247817993, + "logps/chosen": -315.0927734375, + "logps/rejected": -248.7796630859375, + "loss": 0.0776, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.12068784236907959, + "rewards/margins": 5.998594760894775, + "rewards/rejected": -5.877906799316406, + "step": 3440 + }, + { + "epoch": 1.78, + "learning_rate": 2.257601835915089e-07, + "logits/chosen": -2.582331895828247, + "logits/rejected": -2.504185199737549, + "logps/chosen": -333.48358154296875, + "logps/rejected": -384.9881286621094, + "loss": 0.0597, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.09340760856866837, + "rewards/margins": 7.274853706359863, + "rewards/rejected": -7.181446075439453, + "step": 3450 + }, + { + "epoch": 1.79, + "learning_rate": 2.24803977816026e-07, + "logits/chosen": -2.3808603286743164, + "logits/rejected": -2.5695574283599854, + "logps/chosen": -218.94461059570312, + "logps/rejected": -255.54013061523438, + "loss": 0.1264, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.38311663269996643, + "rewards/margins": 8.536532402038574, + "rewards/rejected": -8.153416633605957, + "step": 3460 + }, + { + "epoch": 1.79, + "learning_rate": 2.2384777204054313e-07, + "logits/chosen": -2.7166881561279297, + "logits/rejected": -2.651099920272827, + "logps/chosen": -266.19390869140625, + "logps/rejected": -290.5924377441406, + "loss": 0.0907, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5935767292976379, + "rewards/margins": 7.180167198181152, + "rewards/rejected": -6.586589813232422, + "step": 3470 + }, + { + "epoch": 1.8, + "learning_rate": 2.2289156626506022e-07, + "logits/chosen": -2.6081440448760986, + "logits/rejected": -2.435035228729248, + "logps/chosen": -307.9920959472656, + "logps/rejected": -380.0340270996094, + "loss": 0.081, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.4312973916530609, + "rewards/margins": 8.036073684692383, + "rewards/rejected": -7.604775428771973, + "step": 3480 + }, + { + "epoch": 1.8, + "learning_rate": 2.2193536048957733e-07, + "logits/chosen": -2.6976230144500732, + "logits/rejected": -2.664168119430542, + "logps/chosen": -258.38092041015625, + "logps/rejected": -316.06903076171875, + "loss": 0.1032, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.064023457467556, + "rewards/margins": 6.762887477874756, + "rewards/rejected": -6.698863983154297, + "step": 3490 + }, + { + "epoch": 1.81, + "learning_rate": 2.2097915471409445e-07, + "logits/chosen": -2.7331182956695557, + "logits/rejected": -2.7065072059631348, + "logps/chosen": -242.06661987304688, + "logps/rejected": -319.9682312011719, + "loss": 0.065, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.17091651260852814, + "rewards/margins": 7.219882011413574, + "rewards/rejected": -7.0489654541015625, + "step": 3500 + }, + { + "epoch": 1.81, + "eval_logits/chosen": -2.5004467964172363, + "eval_logits/rejected": -2.4597153663635254, + "eval_logps/chosen": -272.472412109375, + "eval_logps/rejected": -300.49505615234375, + "eval_loss": 0.607377290725708, + "eval_rewards/accuracies": 0.859375, + "eval_rewards/chosen": -1.8237330913543701, + "eval_rewards/margins": 2.528778553009033, + "eval_rewards/rejected": -4.352511405944824, + "eval_runtime": 58.0784, + "eval_samples_per_second": 17.218, + "eval_steps_per_second": 0.275, + "step": 3500 + }, + { + "epoch": 1.81, + "learning_rate": 2.200229489386116e-07, + "logits/chosen": -2.7414891719818115, + "logits/rejected": -2.6085870265960693, + "logps/chosen": -311.56866455078125, + "logps/rejected": -367.4567565917969, + "loss": 0.0892, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.9936217069625854, + "rewards/margins": 6.746335029602051, + "rewards/rejected": -5.752713680267334, + "step": 3510 + }, + { + "epoch": 1.82, + "learning_rate": 2.190667431631287e-07, + "logits/chosen": -2.767604112625122, + "logits/rejected": -2.6218278408050537, + "logps/chosen": -354.67822265625, + "logps/rejected": -397.64068603515625, + "loss": 0.1602, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.2055106163024902, + "rewards/margins": 9.150163650512695, + "rewards/rejected": -7.944652557373047, + "step": 3520 + }, + { + "epoch": 1.82, + "learning_rate": 2.1811053738764582e-07, + "logits/chosen": -2.4568405151367188, + "logits/rejected": -2.4575486183166504, + "logps/chosen": -235.7547149658203, + "logps/rejected": -265.8733215332031, + "loss": 0.0686, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.24134965240955353, + "rewards/margins": 4.935102939605713, + "rewards/rejected": -5.17645263671875, + "step": 3530 + }, + { + "epoch": 1.83, + "learning_rate": 2.1715433161216294e-07, + "logits/chosen": -2.7147650718688965, + "logits/rejected": -2.6502747535705566, + "logps/chosen": -291.44219970703125, + "logps/rejected": -360.6312255859375, + "loss": 0.0713, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0583919286727905, + "rewards/margins": 7.564295768737793, + "rewards/rejected": -6.5059027671813965, + "step": 3540 + }, + { + "epoch": 1.83, + "learning_rate": 2.1619812583668005e-07, + "logits/chosen": -2.5987043380737305, + "logits/rejected": -2.607950448989868, + "logps/chosen": -299.592529296875, + "logps/rejected": -337.1802978515625, + "loss": 0.0877, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.13366484642028809, + "rewards/margins": 6.168055534362793, + "rewards/rejected": -6.034390926361084, + "step": 3550 + }, + { + "epoch": 1.84, + "learning_rate": 2.1524192006119714e-07, + "logits/chosen": -2.549741744995117, + "logits/rejected": -2.519808292388916, + "logps/chosen": -290.70684814453125, + "logps/rejected": -333.82489013671875, + "loss": 0.0653, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6356315612792969, + "rewards/margins": 6.749837398529053, + "rewards/rejected": -7.38546895980835, + "step": 3560 + }, + { + "epoch": 1.84, + "learning_rate": 2.1428571428571426e-07, + "logits/chosen": -2.5867228507995605, + "logits/rejected": -2.5592923164367676, + "logps/chosen": -248.76699829101562, + "logps/rejected": -269.9541015625, + "loss": 0.0585, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.503594696521759, + "rewards/margins": 5.626603126525879, + "rewards/rejected": -6.130197525024414, + "step": 3570 + }, + { + "epoch": 1.85, + "learning_rate": 2.133295085102314e-07, + "logits/chosen": -2.4805967807769775, + "logits/rejected": -2.5831592082977295, + "logps/chosen": -288.694580078125, + "logps/rejected": -289.60638427734375, + "loss": 0.0924, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.20494358241558075, + "rewards/margins": 6.796807289123535, + "rewards/rejected": -7.0017499923706055, + "step": 3580 + }, + { + "epoch": 1.85, + "learning_rate": 2.1237330273474851e-07, + "logits/chosen": -2.6966331005096436, + "logits/rejected": -2.650146245956421, + "logps/chosen": -374.99774169921875, + "logps/rejected": -346.72711181640625, + "loss": 0.1106, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8571535348892212, + "rewards/margins": 7.465939521789551, + "rewards/rejected": -6.608786106109619, + "step": 3590 + }, + { + "epoch": 1.86, + "learning_rate": 2.1141709695926563e-07, + "logits/chosen": -2.73488450050354, + "logits/rejected": -2.7135844230651855, + "logps/chosen": -230.2847137451172, + "logps/rejected": -190.71505737304688, + "loss": 0.0755, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.4617141783237457, + "rewards/margins": 5.763091087341309, + "rewards/rejected": -6.224804878234863, + "step": 3600 + }, + { + "epoch": 1.86, + "eval_logits/chosen": -2.4716105461120605, + "eval_logits/rejected": -2.4327313899993896, + "eval_logps/chosen": -273.4872131347656, + "eval_logps/rejected": -300.97479248046875, + "eval_loss": 0.5835925340652466, + "eval_rewards/accuracies": 0.8125, + "eval_rewards/chosen": -1.9252128601074219, + "eval_rewards/margins": 2.4752719402313232, + "eval_rewards/rejected": -4.400485038757324, + "eval_runtime": 60.2598, + "eval_samples_per_second": 16.595, + "eval_steps_per_second": 0.266, + "step": 3600 + }, + { + "epoch": 1.86, + "learning_rate": 2.1046089118378275e-07, + "logits/chosen": -2.7524561882019043, + "logits/rejected": -2.7061877250671387, + "logps/chosen": -265.36962890625, + "logps/rejected": -293.2806396484375, + "loss": 0.1317, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8292659521102905, + "rewards/margins": 7.344795227050781, + "rewards/rejected": -6.515528678894043, + "step": 3610 + }, + { + "epoch": 1.87, + "learning_rate": 2.0950468540829986e-07, + "logits/chosen": -2.6626524925231934, + "logits/rejected": -2.640347719192505, + "logps/chosen": -207.5610809326172, + "logps/rejected": -238.7421417236328, + "loss": 0.097, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3047277331352234, + "rewards/margins": 6.0444464683532715, + "rewards/rejected": -6.349174499511719, + "step": 3620 + }, + { + "epoch": 1.87, + "learning_rate": 2.0854847963281698e-07, + "logits/chosen": -2.450810194015503, + "logits/rejected": -2.3714869022369385, + "logps/chosen": -290.0536804199219, + "logps/rejected": -285.4010009765625, + "loss": 0.076, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.6087032556533813, + "rewards/margins": 5.092909336090088, + "rewards/rejected": -5.70161247253418, + "step": 3630 + }, + { + "epoch": 1.88, + "learning_rate": 2.0759227385733407e-07, + "logits/chosen": -2.564415216445923, + "logits/rejected": -2.6595184803009033, + "logps/chosen": -372.54949951171875, + "logps/rejected": -315.68438720703125, + "loss": 0.1015, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.47370368242263794, + "rewards/margins": 6.991959571838379, + "rewards/rejected": -7.465662479400635, + "step": 3640 + }, + { + "epoch": 1.88, + "learning_rate": 2.066360680818512e-07, + "logits/chosen": -2.651179552078247, + "logits/rejected": -2.6251769065856934, + "logps/chosen": -375.2741394042969, + "logps/rejected": -317.2344055175781, + "loss": 0.1108, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11416218429803848, + "rewards/margins": 7.2585554122924805, + "rewards/rejected": -7.144394874572754, + "step": 3650 + }, + { + "epoch": 1.89, + "learning_rate": 2.0567986230636832e-07, + "logits/chosen": -2.5170671939849854, + "logits/rejected": -2.639958620071411, + "logps/chosen": -219.71676635742188, + "logps/rejected": -264.04632568359375, + "loss": 0.0927, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3074165880680084, + "rewards/margins": 5.667797565460205, + "rewards/rejected": -5.975214004516602, + "step": 3660 + }, + { + "epoch": 1.89, + "learning_rate": 2.0472365653088544e-07, + "logits/chosen": -2.646237850189209, + "logits/rejected": -2.712930679321289, + "logps/chosen": -297.159423828125, + "logps/rejected": -336.8759765625, + "loss": 0.0811, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.833897590637207, + "rewards/margins": 6.053628444671631, + "rewards/rejected": -6.887526035308838, + "step": 3670 + }, + { + "epoch": 1.9, + "learning_rate": 2.0376745075540256e-07, + "logits/chosen": -2.8176498413085938, + "logits/rejected": -2.798159122467041, + "logps/chosen": -279.7525634765625, + "logps/rejected": -284.43316650390625, + "loss": 0.0621, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006855732295662165, + "rewards/margins": 6.67547082901001, + "rewards/rejected": -6.682325839996338, + "step": 3680 + }, + { + "epoch": 1.91, + "learning_rate": 2.0281124497991967e-07, + "logits/chosen": -2.7028536796569824, + "logits/rejected": -2.6612937450408936, + "logps/chosen": -252.33505249023438, + "logps/rejected": -403.2816467285156, + "loss": 0.0998, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.05786427855491638, + "rewards/margins": 7.661648750305176, + "rewards/rejected": -7.603785037994385, + "step": 3690 + }, + { + "epoch": 1.91, + "learning_rate": 2.018550392044368e-07, + "logits/chosen": -2.7595086097717285, + "logits/rejected": -2.681696653366089, + "logps/chosen": -295.5634460449219, + "logps/rejected": -494.0884704589844, + "loss": 0.0746, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.988511860370636, + "rewards/margins": 10.416128158569336, + "rewards/rejected": -9.427616119384766, + "step": 3700 + }, + { + "epoch": 1.91, + "eval_logits/chosen": -2.5114712715148926, + "eval_logits/rejected": -2.468604564666748, + "eval_logps/chosen": -273.5149230957031, + "eval_logps/rejected": -301.87615966796875, + "eval_loss": 0.5788707137107849, + "eval_rewards/accuracies": 0.8125, + "eval_rewards/chosen": -1.9279824495315552, + "eval_rewards/margins": 2.562638521194458, + "eval_rewards/rejected": -4.4906206130981445, + "eval_runtime": 56.2772, + "eval_samples_per_second": 17.769, + "eval_steps_per_second": 0.284, + "step": 3700 + }, + { + "epoch": 1.92, + "learning_rate": 2.0089883342895388e-07, + "logits/chosen": -2.7059268951416016, + "logits/rejected": -2.753756523132324, + "logps/chosen": -202.04066467285156, + "logps/rejected": -245.59237670898438, + "loss": 0.075, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1704142987728119, + "rewards/margins": 6.434650421142578, + "rewards/rejected": -6.605063438415527, + "step": 3710 + }, + { + "epoch": 1.92, + "learning_rate": 1.9994262765347102e-07, + "logits/chosen": -2.642674207687378, + "logits/rejected": -2.5932514667510986, + "logps/chosen": -399.32305908203125, + "logps/rejected": -326.49798583984375, + "loss": 0.134, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.060870956629514694, + "rewards/margins": 6.860370635986328, + "rewards/rejected": -6.921241760253906, + "step": 3720 + }, + { + "epoch": 1.93, + "learning_rate": 1.9898642187798813e-07, + "logits/chosen": -2.6123080253601074, + "logits/rejected": -2.7516627311706543, + "logps/chosen": -410.9776306152344, + "logps/rejected": -326.8647155761719, + "loss": 0.0628, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07760889828205109, + "rewards/margins": 6.036097049713135, + "rewards/rejected": -6.113706111907959, + "step": 3730 + }, + { + "epoch": 1.93, + "learning_rate": 1.9803021610250525e-07, + "logits/chosen": -2.641099452972412, + "logits/rejected": -2.711040735244751, + "logps/chosen": -216.26535034179688, + "logps/rejected": -274.23516845703125, + "loss": 0.0742, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5522519946098328, + "rewards/margins": 6.619080543518066, + "rewards/rejected": -6.06682825088501, + "step": 3740 + }, + { + "epoch": 1.94, + "learning_rate": 1.9707401032702237e-07, + "logits/chosen": -2.6930148601531982, + "logits/rejected": -2.691132068634033, + "logps/chosen": -269.2910461425781, + "logps/rejected": -311.1435241699219, + "loss": 0.0593, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.5175756216049194, + "rewards/margins": 6.387923240661621, + "rewards/rejected": -5.870347023010254, + "step": 3750 + }, + { + "epoch": 1.94, + "learning_rate": 1.9611780455153948e-07, + "logits/chosen": -2.7549490928649902, + "logits/rejected": -2.7406229972839355, + "logps/chosen": -292.29833984375, + "logps/rejected": -254.7724609375, + "loss": 0.1138, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20989219844341278, + "rewards/margins": 5.599099159240723, + "rewards/rejected": -5.389206886291504, + "step": 3760 + }, + { + "epoch": 1.95, + "learning_rate": 1.951615987760566e-07, + "logits/chosen": -2.5066380500793457, + "logits/rejected": -2.4894328117370605, + "logps/chosen": -221.491455078125, + "logps/rejected": -262.5354309082031, + "loss": 0.072, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.48722711205482483, + "rewards/margins": 4.931153297424316, + "rewards/rejected": -5.418381214141846, + "step": 3770 + }, + { + "epoch": 1.95, + "learning_rate": 1.942053930005737e-07, + "logits/chosen": -2.8080992698669434, + "logits/rejected": -2.69472074508667, + "logps/chosen": -234.15390014648438, + "logps/rejected": -300.17291259765625, + "loss": 0.0526, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7450860142707825, + "rewards/margins": 7.135354518890381, + "rewards/rejected": -7.880439758300781, + "step": 3780 + }, + { + "epoch": 1.96, + "learning_rate": 1.9324918722509086e-07, + "logits/chosen": -2.763511896133423, + "logits/rejected": -2.758317708969116, + "logps/chosen": -267.06695556640625, + "logps/rejected": -251.7860107421875, + "loss": 0.0843, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5050710439682007, + "rewards/margins": 6.807704925537109, + "rewards/rejected": -7.3127760887146, + "step": 3790 + }, + { + "epoch": 1.96, + "learning_rate": 1.9229298144960794e-07, + "logits/chosen": -2.6397032737731934, + "logits/rejected": -2.6277005672454834, + "logps/chosen": -230.0516357421875, + "logps/rejected": -253.93594360351562, + "loss": 0.1348, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.08455387502908707, + "rewards/margins": 5.982255458831787, + "rewards/rejected": -6.06680965423584, + "step": 3800 + }, + { + "epoch": 1.96, + "eval_logits/chosen": -2.5393259525299072, + "eval_logits/rejected": -2.494310140609741, + "eval_logps/chosen": -272.8935546875, + "eval_logps/rejected": -299.39764404296875, + "eval_loss": 0.6015481352806091, + "eval_rewards/accuracies": 0.828125, + "eval_rewards/chosen": -1.865846872329712, + "eval_rewards/margins": 2.376923084259033, + "eval_rewards/rejected": -4.242770195007324, + "eval_runtime": 57.6051, + "eval_samples_per_second": 17.36, + "eval_steps_per_second": 0.278, + "step": 3800 + }, + { + "epoch": 1.97, + "learning_rate": 1.9133677567412506e-07, + "logits/chosen": -2.790476083755493, + "logits/rejected": -2.786289691925049, + "logps/chosen": -319.78619384765625, + "logps/rejected": -282.0672607421875, + "loss": 0.0818, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.11253446340560913, + "rewards/margins": 6.30682897567749, + "rewards/rejected": -6.419363498687744, + "step": 3810 + }, + { + "epoch": 1.97, + "learning_rate": 1.9038056989864218e-07, + "logits/chosen": -2.7111282348632812, + "logits/rejected": -2.765439033508301, + "logps/chosen": -256.03546142578125, + "logps/rejected": -314.55523681640625, + "loss": 0.1137, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.10970799624919891, + "rewards/margins": 6.709108829498291, + "rewards/rejected": -6.818817138671875, + "step": 3820 + }, + { + "epoch": 1.98, + "learning_rate": 1.894243641231593e-07, + "logits/chosen": -2.5691027641296387, + "logits/rejected": -2.4961977005004883, + "logps/chosen": -272.5830993652344, + "logps/rejected": -250.82357788085938, + "loss": 0.103, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4049804210662842, + "rewards/margins": 5.012188911437988, + "rewards/rejected": -6.417168617248535, + "step": 3830 + }, + { + "epoch": 1.98, + "learning_rate": 1.884681583476764e-07, + "logits/chosen": -2.8165037631988525, + "logits/rejected": -2.76141357421875, + "logps/chosen": -229.1115264892578, + "logps/rejected": -300.12347412109375, + "loss": 0.0621, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.30909544229507446, + "rewards/margins": 5.961316108703613, + "rewards/rejected": -6.270411968231201, + "step": 3840 + }, + { + "epoch": 1.99, + "learning_rate": 1.8751195257219352e-07, + "logits/chosen": -2.683171272277832, + "logits/rejected": -2.74794602394104, + "logps/chosen": -281.92901611328125, + "logps/rejected": -404.372314453125, + "loss": 0.0803, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3100479245185852, + "rewards/margins": 7.182066440582275, + "rewards/rejected": -7.492114067077637, + "step": 3850 + }, + { + "epoch": 1.99, + "learning_rate": 1.8655574679671067e-07, + "logits/chosen": -2.660099506378174, + "logits/rejected": -2.69828724861145, + "logps/chosen": -241.91787719726562, + "logps/rejected": -317.3074951171875, + "loss": 0.0951, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.469612717628479, + "rewards/margins": 7.08514404296875, + "rewards/rejected": -8.554756164550781, + "step": 3860 + }, + { + "epoch": 2.0, + "learning_rate": 1.8559954102122778e-07, + "logits/chosen": -2.550110340118408, + "logits/rejected": -2.5635857582092285, + "logps/chosen": -280.5724792480469, + "logps/rejected": -239.25119018554688, + "loss": 0.1036, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.0125322341918945, + "rewards/margins": 4.381348609924316, + "rewards/rejected": -5.393881320953369, + "step": 3870 + }, + { + "epoch": 2.0, + "learning_rate": 1.8464333524574487e-07, + "logits/chosen": -2.7105534076690674, + "logits/rejected": -2.670560598373413, + "logps/chosen": -213.32907104492188, + "logps/rejected": -330.0856628417969, + "loss": 0.0344, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.48191890120506287, + "rewards/margins": 7.262728691101074, + "rewards/rejected": -7.744647026062012, + "step": 3880 + }, + { + "epoch": 2.01, + "learning_rate": 1.8368712947026199e-07, + "logits/chosen": -2.8019955158233643, + "logits/rejected": -2.7659356594085693, + "logps/chosen": -305.8590393066406, + "logps/rejected": -306.02325439453125, + "loss": 0.022, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.42963480949401855, + "rewards/margins": 6.8882341384887695, + "rewards/rejected": -6.458600044250488, + "step": 3890 + }, + { + "epoch": 2.01, + "learning_rate": 1.827309236947791e-07, + "logits/chosen": -2.6406970024108887, + "logits/rejected": -2.6530818939208984, + "logps/chosen": -155.24813842773438, + "logps/rejected": -301.69390869140625, + "loss": 0.0217, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.34457913041114807, + "rewards/margins": 6.758476257324219, + "rewards/rejected": -7.103055000305176, + "step": 3900 + }, + { + "epoch": 2.01, + "eval_logits/chosen": -2.5271873474121094, + "eval_logits/rejected": -2.4840664863586426, + "eval_logps/chosen": -277.5699157714844, + "eval_logps/rejected": -306.1987609863281, + "eval_loss": 0.612151563167572, + "eval_rewards/accuracies": 0.828125, + "eval_rewards/chosen": -2.3334813117980957, + "eval_rewards/margins": 2.589404582977295, + "eval_rewards/rejected": -4.922885894775391, + "eval_runtime": 54.5082, + "eval_samples_per_second": 18.346, + "eval_steps_per_second": 0.294, + "step": 3900 + }, + { + "epoch": 2.02, + "learning_rate": 1.8177471791929622e-07, + "logits/chosen": -2.416943073272705, + "logits/rejected": -2.4777729511260986, + "logps/chosen": -234.59054565429688, + "logps/rejected": -378.48101806640625, + "loss": 0.0294, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7428705096244812, + "rewards/margins": 10.251193046569824, + "rewards/rejected": -10.994062423706055, + "step": 3910 + }, + { + "epoch": 2.02, + "learning_rate": 1.8081851214381333e-07, + "logits/chosen": -2.6043264865875244, + "logits/rejected": -2.5203278064727783, + "logps/chosen": -263.97882080078125, + "logps/rejected": -393.0724182128906, + "loss": 0.0121, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05669177696108818, + "rewards/margins": 8.67068099975586, + "rewards/rejected": -8.613988876342773, + "step": 3920 + }, + { + "epoch": 2.03, + "learning_rate": 1.7986230636833047e-07, + "logits/chosen": -2.6340689659118652, + "logits/rejected": -2.6645379066467285, + "logps/chosen": -179.75973510742188, + "logps/rejected": -246.31448364257812, + "loss": 0.0189, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3356815576553345, + "rewards/margins": 6.356654167175293, + "rewards/rejected": -7.692336082458496, + "step": 3930 + }, + { + "epoch": 2.03, + "learning_rate": 1.789061005928476e-07, + "logits/chosen": -2.812453269958496, + "logits/rejected": -2.752922534942627, + "logps/chosen": -276.16876220703125, + "logps/rejected": -295.46429443359375, + "loss": 0.0279, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.023592447862029076, + "rewards/margins": 7.244173526763916, + "rewards/rejected": -7.2205810546875, + "step": 3940 + }, + { + "epoch": 2.04, + "learning_rate": 1.7794989481736468e-07, + "logits/chosen": -2.6128292083740234, + "logits/rejected": -2.644348382949829, + "logps/chosen": -243.8308563232422, + "logps/rejected": -270.5189514160156, + "loss": 0.0299, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.30936262011528015, + "rewards/margins": 8.135075569152832, + "rewards/rejected": -8.444437026977539, + "step": 3950 + }, + { + "epoch": 2.04, + "learning_rate": 1.769936890418818e-07, + "logits/chosen": -2.5391926765441895, + "logits/rejected": -2.5172486305236816, + "logps/chosen": -303.0284729003906, + "logps/rejected": -303.38739013671875, + "loss": 0.0178, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04590020328760147, + "rewards/margins": 7.713925361633301, + "rewards/rejected": -7.668023586273193, + "step": 3960 + }, + { + "epoch": 2.05, + "learning_rate": 1.760374832663989e-07, + "logits/chosen": -2.785437822341919, + "logits/rejected": -2.667668104171753, + "logps/chosen": -374.7364807128906, + "logps/rejected": -371.83050537109375, + "loss": 0.0097, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3061269521713257, + "rewards/margins": 10.878585815429688, + "rewards/rejected": -9.572458267211914, + "step": 3970 + }, + { + "epoch": 2.05, + "learning_rate": 1.7508127749091603e-07, + "logits/chosen": -2.6636507511138916, + "logits/rejected": -2.622056007385254, + "logps/chosen": -272.5489196777344, + "logps/rejected": -296.45025634765625, + "loss": 0.0159, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6961295008659363, + "rewards/margins": 7.3043532371521, + "rewards/rejected": -8.000483512878418, + "step": 3980 + }, + { + "epoch": 2.06, + "learning_rate": 1.7412507171543314e-07, + "logits/chosen": -2.6111302375793457, + "logits/rejected": -2.7141504287719727, + "logps/chosen": -306.14471435546875, + "logps/rejected": -277.4181213378906, + "loss": 0.0378, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2339712679386139, + "rewards/margins": 8.951885223388672, + "rewards/rejected": -9.185856819152832, + "step": 3990 + }, + { + "epoch": 2.07, + "learning_rate": 1.7316886593995028e-07, + "logits/chosen": -2.770508289337158, + "logits/rejected": -2.7339038848876953, + "logps/chosen": -301.5724182128906, + "logps/rejected": -351.9184875488281, + "loss": 0.0219, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5096687078475952, + "rewards/margins": 8.371912956237793, + "rewards/rejected": -8.881582260131836, + "step": 4000 + }, + { + "epoch": 2.07, + "eval_logits/chosen": -2.454496383666992, + "eval_logits/rejected": -2.4104785919189453, + "eval_logps/chosen": -284.124755859375, + "eval_logps/rejected": -317.1334228515625, + "eval_loss": 0.6521932482719421, + "eval_rewards/accuracies": 0.828125, + "eval_rewards/chosen": -2.988966464996338, + "eval_rewards/margins": 3.0273852348327637, + "eval_rewards/rejected": -6.016351222991943, + "eval_runtime": 61.6079, + "eval_samples_per_second": 16.232, + "eval_steps_per_second": 0.26, + "step": 4000 + }, + { + "epoch": 2.07, + "learning_rate": 1.722126601644674e-07, + "logits/chosen": -2.7440953254699707, + "logits/rejected": -2.736643075942993, + "logps/chosen": -323.197998046875, + "logps/rejected": -265.2000732421875, + "loss": 0.0133, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9906817674636841, + "rewards/margins": 7.5294013023376465, + "rewards/rejected": -8.520084381103516, + "step": 4010 + }, + { + "epoch": 2.08, + "learning_rate": 1.7125645438898452e-07, + "logits/chosen": -2.7173900604248047, + "logits/rejected": -2.676675319671631, + "logps/chosen": -314.3374938964844, + "logps/rejected": -355.52618408203125, + "loss": 0.0061, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3634423315525055, + "rewards/margins": 9.33686637878418, + "rewards/rejected": -9.700309753417969, + "step": 4020 + }, + { + "epoch": 2.08, + "learning_rate": 1.703002486135016e-07, + "logits/chosen": -2.6162686347961426, + "logits/rejected": -2.603562593460083, + "logps/chosen": -248.433837890625, + "logps/rejected": -305.6585388183594, + "loss": 0.0108, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.43873992562294006, + "rewards/margins": 10.635756492614746, + "rewards/rejected": -11.07449722290039, + "step": 4030 + }, + { + "epoch": 2.09, + "learning_rate": 1.6934404283801872e-07, + "logits/chosen": -2.52081036567688, + "logits/rejected": -2.3590970039367676, + "logps/chosen": -352.1839904785156, + "logps/rejected": -366.2679138183594, + "loss": 0.0083, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.43957453966140747, + "rewards/margins": 11.327077865600586, + "rewards/rejected": -10.887503623962402, + "step": 4040 + }, + { + "epoch": 2.09, + "learning_rate": 1.6838783706253584e-07, + "logits/chosen": -2.640784502029419, + "logits/rejected": -2.524874687194824, + "logps/chosen": -167.76235961914062, + "logps/rejected": -211.56985473632812, + "loss": 0.0156, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.1560419499874115, + "rewards/margins": 8.205093383789062, + "rewards/rejected": -8.049051284790039, + "step": 4050 + }, + { + "epoch": 2.1, + "learning_rate": 1.6743163128705295e-07, + "logits/chosen": -2.604750156402588, + "logits/rejected": -2.5685653686523438, + "logps/chosen": -289.0841064453125, + "logps/rejected": -324.72552490234375, + "loss": 0.0133, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2151424884796143, + "rewards/margins": 8.238618850708008, + "rewards/rejected": -10.453761100769043, + "step": 4060 + }, + { + "epoch": 2.1, + "learning_rate": 1.664754255115701e-07, + "logits/chosen": -2.4264097213745117, + "logits/rejected": -2.375046730041504, + "logps/chosen": -224.1468505859375, + "logps/rejected": -290.3971862792969, + "loss": 0.0178, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14240024983882904, + "rewards/margins": 9.841353416442871, + "rewards/rejected": -9.698953628540039, + "step": 4070 + }, + { + "epoch": 2.11, + "learning_rate": 1.655192197360872e-07, + "logits/chosen": -2.753242254257202, + "logits/rejected": -2.6922965049743652, + "logps/chosen": -274.47601318359375, + "logps/rejected": -324.0868835449219, + "loss": 0.0062, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3948326110839844, + "rewards/margins": 8.828141212463379, + "rewards/rejected": -9.22297477722168, + "step": 4080 + }, + { + "epoch": 2.11, + "learning_rate": 1.6456301396060433e-07, + "logits/chosen": -2.554525375366211, + "logits/rejected": -2.6398258209228516, + "logps/chosen": -365.6826477050781, + "logps/rejected": -360.66107177734375, + "loss": 0.019, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2733768820762634, + "rewards/margins": 8.528050422668457, + "rewards/rejected": -8.801426887512207, + "step": 4090 + }, + { + "epoch": 2.12, + "learning_rate": 1.6360680818512144e-07, + "logits/chosen": -2.7123589515686035, + "logits/rejected": -2.61602783203125, + "logps/chosen": -368.64544677734375, + "logps/rejected": -432.6624450683594, + "loss": 0.0119, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.055361270904541, + "rewards/margins": 8.865598678588867, + "rewards/rejected": -9.920958518981934, + "step": 4100 + }, + { + "epoch": 2.12, + "eval_logits/chosen": -2.4698657989501953, + "eval_logits/rejected": -2.4272119998931885, + "eval_logps/chosen": -289.0121154785156, + "eval_logps/rejected": -323.7186584472656, + "eval_loss": 0.692164421081543, + "eval_rewards/accuracies": 0.796875, + "eval_rewards/chosen": -3.477701187133789, + "eval_rewards/margins": 3.197173595428467, + "eval_rewards/rejected": -6.674875259399414, + "eval_runtime": 57.1311, + "eval_samples_per_second": 17.504, + "eval_steps_per_second": 0.28, + "step": 4100 + }, + { + "epoch": 2.12, + "learning_rate": 1.6265060240963853e-07, + "logits/chosen": -2.6370177268981934, + "logits/rejected": -2.5220537185668945, + "logps/chosen": -334.99066162109375, + "logps/rejected": -290.169189453125, + "loss": 0.0078, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.933539867401123, + "rewards/margins": 8.522821426391602, + "rewards/rejected": -7.589282989501953, + "step": 4110 + }, + { + "epoch": 2.13, + "learning_rate": 1.6169439663415565e-07, + "logits/chosen": -2.6449599266052246, + "logits/rejected": -2.6207022666931152, + "logps/chosen": -269.48529052734375, + "logps/rejected": -324.10418701171875, + "loss": 0.0086, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8957691192626953, + "rewards/margins": 9.796669960021973, + "rewards/rejected": -10.692439079284668, + "step": 4120 + }, + { + "epoch": 2.13, + "learning_rate": 1.6073819085867276e-07, + "logits/chosen": -2.797229290008545, + "logits/rejected": -2.7991158962249756, + "logps/chosen": -309.7330627441406, + "logps/rejected": -439.6482849121094, + "loss": 0.0158, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3518083095550537, + "rewards/margins": 11.625936508178711, + "rewards/rejected": -10.274128913879395, + "step": 4130 + }, + { + "epoch": 2.14, + "learning_rate": 1.597819850831899e-07, + "logits/chosen": -2.752419948577881, + "logits/rejected": -2.6186330318450928, + "logps/chosen": -208.90380859375, + "logps/rejected": -247.1297149658203, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2973028421401978, + "rewards/margins": 9.918791770935059, + "rewards/rejected": -8.621490478515625, + "step": 4140 + }, + { + "epoch": 2.14, + "learning_rate": 1.5882577930770702e-07, + "logits/chosen": -2.5983939170837402, + "logits/rejected": -2.551213502883911, + "logps/chosen": -321.56195068359375, + "logps/rejected": -328.3628234863281, + "loss": 0.0088, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.38840678334236145, + "rewards/margins": 10.420036315917969, + "rewards/rejected": -10.03162956237793, + "step": 4150 + }, + { + "epoch": 2.15, + "learning_rate": 1.5786957353222414e-07, + "logits/chosen": -2.5684826374053955, + "logits/rejected": -2.608212471008301, + "logps/chosen": -272.9964599609375, + "logps/rejected": -265.1176452636719, + "loss": 0.0104, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0111887454986572, + "rewards/margins": 7.318711280822754, + "rewards/rejected": -8.329900741577148, + "step": 4160 + }, + { + "epoch": 2.15, + "learning_rate": 1.5691336775674125e-07, + "logits/chosen": -2.4619576930999756, + "logits/rejected": -2.555619716644287, + "logps/chosen": -268.24859619140625, + "logps/rejected": -298.4876403808594, + "loss": 0.0098, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17980532348155975, + "rewards/margins": 9.836331367492676, + "rewards/rejected": -10.01613712310791, + "step": 4170 + }, + { + "epoch": 2.16, + "learning_rate": 1.5595716198125837e-07, + "logits/chosen": -2.7515110969543457, + "logits/rejected": -2.7178173065185547, + "logps/chosen": -400.4342346191406, + "logps/rejected": -458.4161682128906, + "loss": 0.0081, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9931119084358215, + "rewards/margins": 10.737370491027832, + "rewards/rejected": -11.73048210144043, + "step": 4180 + }, + { + "epoch": 2.16, + "learning_rate": 1.5500095620577546e-07, + "logits/chosen": -2.6951303482055664, + "logits/rejected": -2.748305559158325, + "logps/chosen": -240.6981201171875, + "logps/rejected": -257.77752685546875, + "loss": 0.0113, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5856528282165527, + "rewards/margins": 7.461671352386475, + "rewards/rejected": -9.047324180603027, + "step": 4190 + }, + { + "epoch": 2.17, + "learning_rate": 1.5404475043029257e-07, + "logits/chosen": -2.5231451988220215, + "logits/rejected": -2.5645296573638916, + "logps/chosen": -191.24134826660156, + "logps/rejected": -325.54949951171875, + "loss": 0.0153, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5546247959136963, + "rewards/margins": 8.656599998474121, + "rewards/rejected": -10.211225509643555, + "step": 4200 + }, + { + "epoch": 2.17, + "eval_logits/chosen": -2.4464974403381348, + "eval_logits/rejected": -2.4046523571014404, + "eval_logps/chosen": -286.6412658691406, + "eval_logps/rejected": -323.7452697753906, + "eval_loss": 0.6993398666381836, + "eval_rewards/accuracies": 0.796875, + "eval_rewards/chosen": -3.2406165599823, + "eval_rewards/margins": 3.4369187355041504, + "eval_rewards/rejected": -6.677535533905029, + "eval_runtime": 54.7971, + "eval_samples_per_second": 18.249, + "eval_steps_per_second": 0.292, + "step": 4200 + }, + { + "epoch": 2.17, + "learning_rate": 1.5308854465480971e-07, + "logits/chosen": -2.6298282146453857, + "logits/rejected": -2.6375985145568848, + "logps/chosen": -293.63629150390625, + "logps/rejected": -297.9925537109375, + "loss": 0.0112, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0975180864334106, + "rewards/margins": 8.3226318359375, + "rewards/rejected": -9.420149803161621, + "step": 4210 + }, + { + "epoch": 2.18, + "learning_rate": 1.5213233887932683e-07, + "logits/chosen": -2.5850603580474854, + "logits/rejected": -2.606503963470459, + "logps/chosen": -316.19854736328125, + "logps/rejected": -332.8941955566406, + "loss": 0.0088, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.9063823819160461, + "rewards/margins": 7.739400386810303, + "rewards/rejected": -8.645783424377441, + "step": 4220 + }, + { + "epoch": 2.18, + "learning_rate": 1.5117613310384395e-07, + "logits/chosen": -2.5701706409454346, + "logits/rejected": -2.5911612510681152, + "logps/chosen": -269.97894287109375, + "logps/rejected": -319.3363342285156, + "loss": 0.0079, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6825813055038452, + "rewards/margins": 10.70821762084961, + "rewards/rejected": -11.390798568725586, + "step": 4230 + }, + { + "epoch": 2.19, + "learning_rate": 1.5021992732836106e-07, + "logits/chosen": -2.4653377532958984, + "logits/rejected": -2.5559732913970947, + "logps/chosen": -279.4239196777344, + "logps/rejected": -356.7681884765625, + "loss": 0.0143, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9881469011306763, + "rewards/margins": 9.201104164123535, + "rewards/rejected": -10.189250946044922, + "step": 4240 + }, + { + "epoch": 2.19, + "learning_rate": 1.4926372155287818e-07, + "logits/chosen": -2.7210116386413574, + "logits/rejected": -2.593418836593628, + "logps/chosen": -228.53121948242188, + "logps/rejected": -271.88787841796875, + "loss": 0.0185, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9970628619194031, + "rewards/margins": 8.71868896484375, + "rewards/rejected": -9.715751647949219, + "step": 4250 + }, + { + "epoch": 2.2, + "learning_rate": 1.483075157773953e-07, + "logits/chosen": -2.5943050384521484, + "logits/rejected": -2.673746347427368, + "logps/chosen": -251.91336059570312, + "logps/rejected": -270.3241271972656, + "loss": 0.0095, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.251340627670288, + "rewards/margins": 8.234782218933105, + "rewards/rejected": -9.486123085021973, + "step": 4260 + }, + { + "epoch": 2.2, + "learning_rate": 1.4735131000191238e-07, + "logits/chosen": -2.6009936332702637, + "logits/rejected": -2.607675313949585, + "logps/chosen": -309.9886169433594, + "logps/rejected": -332.39801025390625, + "loss": 0.0121, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0659501552581787, + "rewards/margins": 9.141355514526367, + "rewards/rejected": -10.207304954528809, + "step": 4270 + }, + { + "epoch": 2.21, + "learning_rate": 1.4639510422642952e-07, + "logits/chosen": -2.7442212104797363, + "logits/rejected": -2.6310532093048096, + "logps/chosen": -345.13616943359375, + "logps/rejected": -405.30755615234375, + "loss": 0.01, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7909820079803467, + "rewards/margins": 9.60711669921875, + "rewards/rejected": -10.398099899291992, + "step": 4280 + }, + { + "epoch": 2.21, + "learning_rate": 1.4543889845094664e-07, + "logits/chosen": -2.4257078170776367, + "logits/rejected": -2.461683750152588, + "logps/chosen": -375.21478271484375, + "logps/rejected": -433.16973876953125, + "loss": 0.0485, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8098801374435425, + "rewards/margins": 11.403145790100098, + "rewards/rejected": -12.21302604675293, + "step": 4290 + }, + { + "epoch": 2.22, + "learning_rate": 1.4448269267546376e-07, + "logits/chosen": -2.7228384017944336, + "logits/rejected": -2.763788938522339, + "logps/chosen": -330.9010314941406, + "logps/rejected": -367.5445861816406, + "loss": 0.011, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8657931089401245, + "rewards/margins": 10.422277450561523, + "rewards/rejected": -9.55648422241211, + "step": 4300 + }, + { + "epoch": 2.22, + "eval_logits/chosen": -2.428981304168701, + "eval_logits/rejected": -2.3842594623565674, + "eval_logps/chosen": -292.2260437011719, + "eval_logps/rejected": -331.3666687011719, + "eval_loss": 0.7177846431732178, + "eval_rewards/accuracies": 0.765625, + "eval_rewards/chosen": -3.7990951538085938, + "eval_rewards/margins": 3.6405770778656006, + "eval_rewards/rejected": -7.439671993255615, + "eval_runtime": 57.5668, + "eval_samples_per_second": 17.371, + "eval_steps_per_second": 0.278, + "step": 4300 + }, + { + "epoch": 2.23, + "learning_rate": 1.4352648689998087e-07, + "logits/chosen": -2.6788887977600098, + "logits/rejected": -2.659087657928467, + "logps/chosen": -255.2762908935547, + "logps/rejected": -230.3298797607422, + "loss": 0.0081, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5762121081352234, + "rewards/margins": 9.638313293457031, + "rewards/rejected": -10.21452522277832, + "step": 4310 + }, + { + "epoch": 2.23, + "learning_rate": 1.42570281124498e-07, + "logits/chosen": -2.5874216556549072, + "logits/rejected": -2.647291898727417, + "logps/chosen": -264.53802490234375, + "logps/rejected": -368.0313415527344, + "loss": 0.0123, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9759609699249268, + "rewards/margins": 9.195481300354004, + "rewards/rejected": -11.171442031860352, + "step": 4320 + }, + { + "epoch": 2.24, + "learning_rate": 1.416140753490151e-07, + "logits/chosen": -2.6484475135803223, + "logits/rejected": -2.7253453731536865, + "logps/chosen": -338.7431640625, + "logps/rejected": -423.6756896972656, + "loss": 0.0099, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1497945338487625, + "rewards/margins": 11.79082202911377, + "rewards/rejected": -11.940614700317383, + "step": 4330 + }, + { + "epoch": 2.24, + "learning_rate": 1.4065786957353222e-07, + "logits/chosen": -2.5038111209869385, + "logits/rejected": -2.5019071102142334, + "logps/chosen": -315.7591552734375, + "logps/rejected": -336.18963623046875, + "loss": 0.0246, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.016095232218503952, + "rewards/margins": 8.441411018371582, + "rewards/rejected": -8.425315856933594, + "step": 4340 + }, + { + "epoch": 2.25, + "learning_rate": 1.3970166379804933e-07, + "logits/chosen": -2.49928879737854, + "logits/rejected": -2.376461982727051, + "logps/chosen": -380.0243225097656, + "logps/rejected": -395.73077392578125, + "loss": 0.0177, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1554236114025116, + "rewards/margins": 9.892860412597656, + "rewards/rejected": -10.048284530639648, + "step": 4350 + }, + { + "epoch": 2.25, + "learning_rate": 1.3874545802256645e-07, + "logits/chosen": -2.758044481277466, + "logits/rejected": -2.6601271629333496, + "logps/chosen": -311.03436279296875, + "logps/rejected": -419.60418701171875, + "loss": 0.0129, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6286399364471436, + "rewards/margins": 10.120224952697754, + "rewards/rejected": -11.748865127563477, + "step": 4360 + }, + { + "epoch": 2.26, + "learning_rate": 1.3778925224708357e-07, + "logits/chosen": -2.516096830368042, + "logits/rejected": -2.5368704795837402, + "logps/chosen": -253.93722534179688, + "logps/rejected": -275.40423583984375, + "loss": 0.006, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05337781831622124, + "rewards/margins": 8.323257446289062, + "rewards/rejected": -8.376635551452637, + "step": 4370 + }, + { + "epoch": 2.26, + "learning_rate": 1.3683304647160068e-07, + "logits/chosen": -2.6350722312927246, + "logits/rejected": -2.5284571647644043, + "logps/chosen": -279.087158203125, + "logps/rejected": -357.74542236328125, + "loss": 0.0119, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0737884044647217, + "rewards/margins": 10.794805526733398, + "rewards/rejected": -11.868593215942383, + "step": 4380 + }, + { + "epoch": 2.27, + "learning_rate": 1.358768406961178e-07, + "logits/chosen": -2.6859638690948486, + "logits/rejected": -2.735161066055298, + "logps/chosen": -295.9905700683594, + "logps/rejected": -444.8924255371094, + "loss": 0.0096, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7544025778770447, + "rewards/margins": 10.69865608215332, + "rewards/rejected": -9.944252967834473, + "step": 4390 + }, + { + "epoch": 2.27, + "learning_rate": 1.349206349206349e-07, + "logits/chosen": -2.757059335708618, + "logits/rejected": -2.771275520324707, + "logps/chosen": -266.63800048828125, + "logps/rejected": -281.4782409667969, + "loss": 0.0072, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0992039442062378, + "rewards/margins": 8.593305587768555, + "rewards/rejected": -9.692508697509766, + "step": 4400 + }, + { + "epoch": 2.27, + "eval_logits/chosen": -2.4535796642303467, + "eval_logits/rejected": -2.4095299243927, + "eval_logps/chosen": -287.504150390625, + "eval_logps/rejected": -324.9907531738281, + "eval_loss": 0.6839932203292847, + "eval_rewards/accuracies": 0.8125, + "eval_rewards/chosen": -3.326904535293579, + "eval_rewards/margins": 3.47517728805542, + "eval_rewards/rejected": -6.802082061767578, + "eval_runtime": 58.0489, + "eval_samples_per_second": 17.227, + "eval_steps_per_second": 0.276, + "step": 4400 + }, + { + "epoch": 2.28, + "learning_rate": 1.3396442914515203e-07, + "logits/chosen": -2.330714464187622, + "logits/rejected": -2.469642400741577, + "logps/chosen": -260.82843017578125, + "logps/rejected": -299.21343994140625, + "loss": 0.0115, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.7311785221099854, + "rewards/margins": 8.269608497619629, + "rewards/rejected": -10.000787734985352, + "step": 4410 + }, + { + "epoch": 2.28, + "learning_rate": 1.3300822336966917e-07, + "logits/chosen": -2.5285234451293945, + "logits/rejected": -2.3487613201141357, + "logps/chosen": -334.66229248046875, + "logps/rejected": -329.3540954589844, + "loss": 0.0094, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.163953959941864, + "rewards/margins": 9.816844940185547, + "rewards/rejected": -9.652891159057617, + "step": 4420 + }, + { + "epoch": 2.29, + "learning_rate": 1.3205201759418626e-07, + "logits/chosen": -2.400176525115967, + "logits/rejected": -2.173835277557373, + "logps/chosen": -355.26043701171875, + "logps/rejected": -349.78851318359375, + "loss": 0.0099, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.45656710863113403, + "rewards/margins": 10.023509979248047, + "rewards/rejected": -9.56694221496582, + "step": 4430 + }, + { + "epoch": 2.29, + "learning_rate": 1.3109581181870338e-07, + "logits/chosen": -2.611816883087158, + "logits/rejected": -2.6642374992370605, + "logps/chosen": -294.43756103515625, + "logps/rejected": -321.86846923828125, + "loss": 0.0105, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08868559449911118, + "rewards/margins": 9.15350341796875, + "rewards/rejected": -9.064818382263184, + "step": 4440 + }, + { + "epoch": 2.3, + "learning_rate": 1.301396060432205e-07, + "logits/chosen": -2.6899092197418213, + "logits/rejected": -2.6209728717803955, + "logps/chosen": -340.12030029296875, + "logps/rejected": -341.85638427734375, + "loss": 0.0079, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5528199672698975, + "rewards/margins": 10.257894515991211, + "rewards/rejected": -10.810712814331055, + "step": 4450 + }, + { + "epoch": 2.3, + "learning_rate": 1.291834002677376e-07, + "logits/chosen": -2.5746819972991943, + "logits/rejected": -2.4712207317352295, + "logps/chosen": -340.21661376953125, + "logps/rejected": -348.29376220703125, + "loss": 0.0111, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.839795470237732, + "rewards/margins": 9.294793128967285, + "rewards/rejected": -11.134590148925781, + "step": 4460 + }, + { + "epoch": 2.31, + "learning_rate": 1.2822719449225472e-07, + "logits/chosen": -2.4497411251068115, + "logits/rejected": -2.6023406982421875, + "logps/chosen": -258.5740966796875, + "logps/rejected": -322.1835021972656, + "loss": 0.0067, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4901916980743408, + "rewards/margins": 8.80390739440918, + "rewards/rejected": -10.294098854064941, + "step": 4470 + }, + { + "epoch": 2.31, + "learning_rate": 1.2727098871677184e-07, + "logits/chosen": -2.679898262023926, + "logits/rejected": -2.6797006130218506, + "logps/chosen": -358.4029235839844, + "logps/rejected": -349.04119873046875, + "loss": 0.004, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.31357401609420776, + "rewards/margins": 9.342456817626953, + "rewards/rejected": -9.656030654907227, + "step": 4480 + }, + { + "epoch": 2.32, + "learning_rate": 1.2631478294128898e-07, + "logits/chosen": -2.6513264179229736, + "logits/rejected": -2.6451632976531982, + "logps/chosen": -398.11871337890625, + "logps/rejected": -359.9664611816406, + "loss": 0.0102, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07513128221035004, + "rewards/margins": 10.039201736450195, + "rewards/rejected": -9.964070320129395, + "step": 4490 + }, + { + "epoch": 2.32, + "learning_rate": 1.253585771658061e-07, + "logits/chosen": -2.587759017944336, + "logits/rejected": -2.633078098297119, + "logps/chosen": -251.6234588623047, + "logps/rejected": -377.10443115234375, + "loss": 0.0197, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1120997667312622, + "rewards/margins": 9.92921257019043, + "rewards/rejected": -11.041314125061035, + "step": 4500 + }, + { + "epoch": 2.32, + "eval_logits/chosen": -2.4542932510375977, + "eval_logits/rejected": -2.411810874938965, + "eval_logps/chosen": -291.1250305175781, + "eval_logps/rejected": -329.98406982421875, + "eval_loss": 0.7013015151023865, + "eval_rewards/accuracies": 0.8125, + "eval_rewards/chosen": -3.688992738723755, + "eval_rewards/margins": 3.612422466278076, + "eval_rewards/rejected": -7.301414966583252, + "eval_runtime": 56.7399, + "eval_samples_per_second": 17.624, + "eval_steps_per_second": 0.282, + "step": 4500 + }, + { + "epoch": 2.33, + "learning_rate": 1.2440237139032319e-07, + "logits/chosen": -2.7155184745788574, + "logits/rejected": -2.7012360095977783, + "logps/chosen": -270.6969909667969, + "logps/rejected": -277.15362548828125, + "loss": 0.0102, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8242677450180054, + "rewards/margins": 10.182249069213867, + "rewards/rejected": -12.006516456604004, + "step": 4510 + }, + { + "epoch": 2.33, + "learning_rate": 1.234461656148403e-07, + "logits/chosen": -2.7778592109680176, + "logits/rejected": -2.6845195293426514, + "logps/chosen": -406.66497802734375, + "logps/rejected": -398.89044189453125, + "loss": 0.0065, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5758999586105347, + "rewards/margins": 11.570829391479492, + "rewards/rejected": -13.146730422973633, + "step": 4520 + }, + { + "epoch": 2.34, + "learning_rate": 1.2248995983935742e-07, + "logits/chosen": -2.7265734672546387, + "logits/rejected": -2.6226305961608887, + "logps/chosen": -284.33843994140625, + "logps/rejected": -360.93121337890625, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6043723821640015, + "rewards/margins": 10.222024917602539, + "rewards/rejected": -10.826397895812988, + "step": 4530 + }, + { + "epoch": 2.34, + "learning_rate": 1.2153375406387456e-07, + "logits/chosen": -2.724083185195923, + "logits/rejected": -2.75142765045166, + "logps/chosen": -355.35504150390625, + "logps/rejected": -472.2686462402344, + "loss": 0.008, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3634461164474487, + "rewards/margins": 11.717732429504395, + "rewards/rejected": -13.081178665161133, + "step": 4540 + }, + { + "epoch": 2.35, + "learning_rate": 1.2057754828839165e-07, + "logits/chosen": -2.666905641555786, + "logits/rejected": -2.737536907196045, + "logps/chosen": -310.6121520996094, + "logps/rejected": -350.0155029296875, + "loss": 0.0089, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12724515795707703, + "rewards/margins": 9.271829605102539, + "rewards/rejected": -9.39907455444336, + "step": 4550 + }, + { + "epoch": 2.35, + "learning_rate": 1.1962134251290876e-07, + "logits/chosen": -2.6463513374328613, + "logits/rejected": -2.6242516040802, + "logps/chosen": -268.8026123046875, + "logps/rejected": -253.2088623046875, + "loss": 0.0123, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7786948680877686, + "rewards/margins": 8.877888679504395, + "rewards/rejected": -10.656583786010742, + "step": 4560 + }, + { + "epoch": 2.36, + "learning_rate": 1.1866513673742588e-07, + "logits/chosen": -2.6526236534118652, + "logits/rejected": -2.5933640003204346, + "logps/chosen": -244.880615234375, + "logps/rejected": -330.068603515625, + "loss": 0.0185, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4325166940689087, + "rewards/margins": 8.305582046508789, + "rewards/rejected": -9.73809814453125, + "step": 4570 + }, + { + "epoch": 2.36, + "learning_rate": 1.1770893096194301e-07, + "logits/chosen": -2.6464786529541016, + "logits/rejected": -2.621084451675415, + "logps/chosen": -358.1322326660156, + "logps/rejected": -398.2645568847656, + "loss": 0.0102, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9040740728378296, + "rewards/margins": 10.22703742980957, + "rewards/rejected": -12.131113052368164, + "step": 4580 + }, + { + "epoch": 2.37, + "learning_rate": 1.1675272518646012e-07, + "logits/chosen": -2.704784631729126, + "logits/rejected": -2.6682817935943604, + "logps/chosen": -297.62274169921875, + "logps/rejected": -330.6324462890625, + "loss": 0.0272, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19843515753746033, + "rewards/margins": 9.420347213745117, + "rewards/rejected": -9.618782997131348, + "step": 4590 + }, + { + "epoch": 2.37, + "learning_rate": 1.1579651941097724e-07, + "logits/chosen": -2.6055915355682373, + "logits/rejected": -2.6153995990753174, + "logps/chosen": -305.62933349609375, + "logps/rejected": -291.2359924316406, + "loss": 0.0182, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.7859185934066772, + "rewards/margins": 8.813383102416992, + "rewards/rejected": -10.599302291870117, + "step": 4600 + }, + { + "epoch": 2.37, + "eval_logits/chosen": -2.4565374851226807, + "eval_logits/rejected": -2.416307210922241, + "eval_logps/chosen": -293.22906494140625, + "eval_logps/rejected": -332.3355712890625, + "eval_loss": 0.7476168870925903, + "eval_rewards/accuracies": 0.828125, + "eval_rewards/chosen": -3.8993959426879883, + "eval_rewards/margins": 3.637169361114502, + "eval_rewards/rejected": -7.536564826965332, + "eval_runtime": 57.2122, + "eval_samples_per_second": 17.479, + "eval_steps_per_second": 0.28, + "step": 4600 + }, + { + "epoch": 2.38, + "learning_rate": 1.1484031363549436e-07, + "logits/chosen": -2.5126757621765137, + "logits/rejected": -2.449023962020874, + "logps/chosen": -327.66717529296875, + "logps/rejected": -361.0265808105469, + "loss": 0.0268, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.0840609073638916, + "rewards/margins": 10.741630554199219, + "rewards/rejected": -11.825691223144531, + "step": 4610 + }, + { + "epoch": 2.39, + "learning_rate": 1.1388410786001147e-07, + "logits/chosen": -2.6590983867645264, + "logits/rejected": -2.688147783279419, + "logps/chosen": -304.8904113769531, + "logps/rejected": -383.8213195800781, + "loss": 0.0162, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.8201377391815186, + "rewards/margins": 9.453073501586914, + "rewards/rejected": -11.273211479187012, + "step": 4620 + }, + { + "epoch": 2.39, + "learning_rate": 1.1292790208452859e-07, + "logits/chosen": -2.6834404468536377, + "logits/rejected": -2.6824703216552734, + "logps/chosen": -271.0035400390625, + "logps/rejected": -379.20989990234375, + "loss": 0.0132, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0902073383331299, + "rewards/margins": 11.127284049987793, + "rewards/rejected": -12.21749210357666, + "step": 4630 + }, + { + "epoch": 2.4, + "learning_rate": 1.119716963090457e-07, + "logits/chosen": -2.3537399768829346, + "logits/rejected": -2.4233551025390625, + "logps/chosen": -216.2086944580078, + "logps/rejected": -297.00640869140625, + "loss": 0.0146, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2917330265045166, + "rewards/margins": 7.949918270111084, + "rewards/rejected": -10.24165153503418, + "step": 4640 + }, + { + "epoch": 2.4, + "learning_rate": 1.1101549053356282e-07, + "logits/chosen": -2.7646780014038086, + "logits/rejected": -2.6880381107330322, + "logps/chosen": -306.4629821777344, + "logps/rejected": -336.5583190917969, + "loss": 0.0227, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0529628992080688, + "rewards/margins": 8.59121036529541, + "rewards/rejected": -9.644172668457031, + "step": 4650 + }, + { + "epoch": 2.41, + "learning_rate": 1.1005928475807993e-07, + "logits/chosen": -2.550281286239624, + "logits/rejected": -2.499551296234131, + "logps/chosen": -284.04730224609375, + "logps/rejected": -312.99896240234375, + "loss": 0.0107, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5507326126098633, + "rewards/margins": 9.462206840515137, + "rewards/rejected": -11.012939453125, + "step": 4660 + }, + { + "epoch": 2.41, + "learning_rate": 1.0910307898259705e-07, + "logits/chosen": -2.3352179527282715, + "logits/rejected": -2.438673973083496, + "logps/chosen": -236.6370849609375, + "logps/rejected": -284.3169250488281, + "loss": 0.0135, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.611262559890747, + "rewards/margins": 8.59797477722168, + "rewards/rejected": -10.209238052368164, + "step": 4670 + }, + { + "epoch": 2.42, + "learning_rate": 1.0814687320711418e-07, + "logits/chosen": -2.474139928817749, + "logits/rejected": -2.377544641494751, + "logps/chosen": -238.1358184814453, + "logps/rejected": -414.88720703125, + "loss": 0.0087, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9006067514419556, + "rewards/margins": 9.972890853881836, + "rewards/rejected": -10.873498916625977, + "step": 4680 + }, + { + "epoch": 2.42, + "learning_rate": 1.0719066743163128e-07, + "logits/chosen": -2.493590831756592, + "logits/rejected": -2.6044669151306152, + "logps/chosen": -323.1622009277344, + "logps/rejected": -323.79510498046875, + "loss": 0.0178, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.587964415550232, + "rewards/margins": 9.504448890686035, + "rewards/rejected": -11.092413902282715, + "step": 4690 + }, + { + "epoch": 2.43, + "learning_rate": 1.062344616561484e-07, + "logits/chosen": -2.638388156890869, + "logits/rejected": -2.634883403778076, + "logps/chosen": -368.1080627441406, + "logps/rejected": -507.3169860839844, + "loss": 0.0125, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11677348613739014, + "rewards/margins": 13.523852348327637, + "rewards/rejected": -13.640626907348633, + "step": 4700 + }, + { + "epoch": 2.43, + "eval_logits/chosen": -2.4100139141082764, + "eval_logits/rejected": -2.369899272918701, + "eval_logps/chosen": -294.79522705078125, + "eval_logps/rejected": -332.7344665527344, + "eval_loss": 0.7198817133903503, + "eval_rewards/accuracies": 0.84375, + "eval_rewards/chosen": -4.056015968322754, + "eval_rewards/margins": 3.5204358100891113, + "eval_rewards/rejected": -7.576451778411865, + "eval_runtime": 55.0706, + "eval_samples_per_second": 18.158, + "eval_steps_per_second": 0.291, + "step": 4700 + }, + { + "epoch": 2.43, + "learning_rate": 1.0527825588066551e-07, + "logits/chosen": -2.615658760070801, + "logits/rejected": -2.48193097114563, + "logps/chosen": -350.5819091796875, + "logps/rejected": -332.139892578125, + "loss": 0.0096, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4044158458709717, + "rewards/margins": 10.358014106750488, + "rewards/rejected": -11.762430191040039, + "step": 4710 + }, + { + "epoch": 2.44, + "learning_rate": 1.0432205010518264e-07, + "logits/chosen": -2.6633851528167725, + "logits/rejected": -2.6755900382995605, + "logps/chosen": -244.67703247070312, + "logps/rejected": -381.3924865722656, + "loss": 0.0063, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5906885266304016, + "rewards/margins": 11.672990798950195, + "rewards/rejected": -12.263678550720215, + "step": 4720 + }, + { + "epoch": 2.44, + "learning_rate": 1.0336584432969974e-07, + "logits/chosen": -2.4058127403259277, + "logits/rejected": -2.398548126220703, + "logps/chosen": -268.20660400390625, + "logps/rejected": -309.49078369140625, + "loss": 0.0141, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.11153459548950195, + "rewards/margins": 9.980080604553223, + "rewards/rejected": -9.868546485900879, + "step": 4730 + }, + { + "epoch": 2.45, + "learning_rate": 1.0240963855421686e-07, + "logits/chosen": -2.542297601699829, + "logits/rejected": -2.5843400955200195, + "logps/chosen": -385.5765686035156, + "logps/rejected": -344.6966552734375, + "loss": 0.0071, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7057178020477295, + "rewards/margins": 9.097609519958496, + "rewards/rejected": -10.803327560424805, + "step": 4740 + }, + { + "epoch": 2.45, + "learning_rate": 1.0145343277873399e-07, + "logits/chosen": -2.505624771118164, + "logits/rejected": -2.4930660724639893, + "logps/chosen": -330.05987548828125, + "logps/rejected": -383.5957336425781, + "loss": 0.0139, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8719791173934937, + "rewards/margins": 10.555585861206055, + "rewards/rejected": -11.427566528320312, + "step": 4750 + }, + { + "epoch": 2.46, + "learning_rate": 1.004972270032511e-07, + "logits/chosen": -2.2423624992370605, + "logits/rejected": -2.250560760498047, + "logps/chosen": -287.89349365234375, + "logps/rejected": -298.4164123535156, + "loss": 0.0167, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.064730167388916, + "rewards/margins": 7.943607330322266, + "rewards/rejected": -9.00833797454834, + "step": 4760 + }, + { + "epoch": 2.46, + "learning_rate": 9.95410212277682e-08, + "logits/chosen": -2.6729438304901123, + "logits/rejected": -2.5839288234710693, + "logps/chosen": -304.6081237792969, + "logps/rejected": -281.4034423828125, + "loss": 0.0149, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.613656759262085, + "rewards/margins": 8.870094299316406, + "rewards/rejected": -9.483750343322754, + "step": 4770 + }, + { + "epoch": 2.47, + "learning_rate": 9.858481545228532e-08, + "logits/chosen": -2.431548833847046, + "logits/rejected": -2.5211846828460693, + "logps/chosen": -185.5460205078125, + "logps/rejected": -266.6904602050781, + "loss": 0.0164, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.3175272941589355, + "rewards/margins": 8.721592903137207, + "rewards/rejected": -10.039118766784668, + "step": 4780 + }, + { + "epoch": 2.47, + "learning_rate": 9.762860967680245e-08, + "logits/chosen": -2.5315418243408203, + "logits/rejected": -2.6745972633361816, + "logps/chosen": -215.60311889648438, + "logps/rejected": -252.6163330078125, + "loss": 0.0082, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5650713443756104, + "rewards/margins": 8.301115036010742, + "rewards/rejected": -9.866186141967773, + "step": 4790 + }, + { + "epoch": 2.48, + "learning_rate": 9.667240390131957e-08, + "logits/chosen": -2.6866960525512695, + "logits/rejected": -2.6582419872283936, + "logps/chosen": -263.9376220703125, + "logps/rejected": -329.9527587890625, + "loss": 0.0082, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7252375483512878, + "rewards/margins": 12.293913841247559, + "rewards/rejected": -13.01915168762207, + "step": 4800 + }, + { + "epoch": 2.48, + "eval_logits/chosen": -2.430349826812744, + "eval_logits/rejected": -2.3925321102142334, + "eval_logps/chosen": -290.84771728515625, + "eval_logps/rejected": -328.32550048828125, + "eval_loss": 0.7047879695892334, + "eval_rewards/accuracies": 0.875, + "eval_rewards/chosen": -3.6612637042999268, + "eval_rewards/margins": 3.474294662475586, + "eval_rewards/rejected": -7.135558605194092, + "eval_runtime": 56.5008, + "eval_samples_per_second": 17.699, + "eval_steps_per_second": 0.283, + "step": 4800 + }, + { + "epoch": 2.48, + "learning_rate": 9.571619812583667e-08, + "logits/chosen": -2.3522887229919434, + "logits/rejected": -2.5020272731781006, + "logps/chosen": -404.24993896484375, + "logps/rejected": -365.1546936035156, + "loss": 0.0103, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5152179598808289, + "rewards/margins": 10.364774703979492, + "rewards/rejected": -10.879993438720703, + "step": 4810 + }, + { + "epoch": 2.49, + "learning_rate": 9.47599923503538e-08, + "logits/chosen": -2.634892225265503, + "logits/rejected": -2.660521984100342, + "logps/chosen": -330.85308837890625, + "logps/rejected": -385.0195617675781, + "loss": 0.0189, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.17577147483825684, + "rewards/margins": 10.900343894958496, + "rewards/rejected": -11.076115608215332, + "step": 4820 + }, + { + "epoch": 2.49, + "learning_rate": 9.380378657487091e-08, + "logits/chosen": -2.514988422393799, + "logits/rejected": -2.510554790496826, + "logps/chosen": -250.59939575195312, + "logps/rejected": -327.1246643066406, + "loss": 0.0152, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6027127504348755, + "rewards/margins": 11.022318840026855, + "rewards/rejected": -11.625032424926758, + "step": 4830 + }, + { + "epoch": 2.5, + "learning_rate": 9.284758079938803e-08, + "logits/chosen": -2.7160019874572754, + "logits/rejected": -2.725782632827759, + "logps/chosen": -366.26788330078125, + "logps/rejected": -313.48223876953125, + "loss": 0.0077, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3688530921936035, + "rewards/margins": 8.293670654296875, + "rewards/rejected": -9.66252326965332, + "step": 4840 + }, + { + "epoch": 2.5, + "learning_rate": 9.189137502390513e-08, + "logits/chosen": -2.5986154079437256, + "logits/rejected": -2.60760760307312, + "logps/chosen": -338.04925537109375, + "logps/rejected": -425.7908630371094, + "loss": 0.0096, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2331068515777588, + "rewards/margins": 9.989707946777344, + "rewards/rejected": -11.222814559936523, + "step": 4850 + }, + { + "epoch": 2.51, + "learning_rate": 9.093516924842226e-08, + "logits/chosen": -2.5680298805236816, + "logits/rejected": -2.603311061859131, + "logps/chosen": -270.52349853515625, + "logps/rejected": -418.3185119628906, + "loss": 0.0155, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3546164631843567, + "rewards/margins": 11.326202392578125, + "rewards/rejected": -11.680818557739258, + "step": 4860 + }, + { + "epoch": 2.51, + "learning_rate": 8.997896347293938e-08, + "logits/chosen": -2.529101610183716, + "logits/rejected": -2.4874515533447266, + "logps/chosen": -205.5690460205078, + "logps/rejected": -333.98065185546875, + "loss": 0.0151, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0936403274536133, + "rewards/margins": 9.565814018249512, + "rewards/rejected": -10.659454345703125, + "step": 4870 + }, + { + "epoch": 2.52, + "learning_rate": 8.902275769745648e-08, + "logits/chosen": -2.4107840061187744, + "logits/rejected": -2.529804229736328, + "logps/chosen": -229.46145629882812, + "logps/rejected": -267.4582214355469, + "loss": 0.0123, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.5835365056991577, + "rewards/margins": 7.870436668395996, + "rewards/rejected": -9.453973770141602, + "step": 4880 + }, + { + "epoch": 2.52, + "learning_rate": 8.806655192197361e-08, + "logits/chosen": -2.4289088249206543, + "logits/rejected": -2.549330949783325, + "logps/chosen": -171.3069610595703, + "logps/rejected": -321.93853759765625, + "loss": 0.0065, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.041638135910034, + "rewards/margins": 10.020352363586426, + "rewards/rejected": -12.061990737915039, + "step": 4890 + }, + { + "epoch": 2.53, + "learning_rate": 8.711034614649072e-08, + "logits/chosen": -2.7017006874084473, + "logits/rejected": -2.7009201049804688, + "logps/chosen": -279.64984130859375, + "logps/rejected": -352.21160888671875, + "loss": 0.0118, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1117719411849976, + "rewards/margins": 10.062509536743164, + "rewards/rejected": -11.17428207397461, + "step": 4900 + }, + { + "epoch": 2.53, + "eval_logits/chosen": -2.4046812057495117, + "eval_logits/rejected": -2.36327862739563, + "eval_logps/chosen": -292.14312744140625, + "eval_logps/rejected": -330.12237548828125, + "eval_loss": 0.6975539326667786, + "eval_rewards/accuracies": 0.8125, + "eval_rewards/chosen": -3.7908036708831787, + "eval_rewards/margins": 3.524440288543701, + "eval_rewards/rejected": -7.315243721008301, + "eval_runtime": 53.2942, + "eval_samples_per_second": 18.764, + "eval_steps_per_second": 0.3, + "step": 4900 + }, + { + "epoch": 2.53, + "learning_rate": 8.615414037100784e-08, + "logits/chosen": -2.5984580516815186, + "logits/rejected": -2.746319532394409, + "logps/chosen": -321.95367431640625, + "logps/rejected": -298.1436767578125, + "loss": 0.0084, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.020658016204834, + "rewards/margins": 8.369918823242188, + "rewards/rejected": -10.390576362609863, + "step": 4910 + }, + { + "epoch": 2.54, + "learning_rate": 8.519793459552494e-08, + "logits/chosen": -2.359086513519287, + "logits/rejected": -2.3888332843780518, + "logps/chosen": -395.9248962402344, + "logps/rejected": -374.02069091796875, + "loss": 0.014, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04106631129980087, + "rewards/margins": 11.742452621459961, + "rewards/rejected": -11.783517837524414, + "step": 4920 + }, + { + "epoch": 2.55, + "learning_rate": 8.424172882004207e-08, + "logits/chosen": -2.661177158355713, + "logits/rejected": -2.6514670848846436, + "logps/chosen": -387.62054443359375, + "logps/rejected": -339.0218505859375, + "loss": 0.0263, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0007514476892538369, + "rewards/margins": 9.993762016296387, + "rewards/rejected": -9.993009567260742, + "step": 4930 + }, + { + "epoch": 2.55, + "learning_rate": 8.328552304455919e-08, + "logits/chosen": -2.335365056991577, + "logits/rejected": -2.317937135696411, + "logps/chosen": -231.7373504638672, + "logps/rejected": -296.01287841796875, + "loss": 0.0162, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7403205633163452, + "rewards/margins": 9.786886215209961, + "rewards/rejected": -10.527207374572754, + "step": 4940 + }, + { + "epoch": 2.56, + "learning_rate": 8.23293172690763e-08, + "logits/chosen": -2.5740818977355957, + "logits/rejected": -2.612046718597412, + "logps/chosen": -265.88116455078125, + "logps/rejected": -311.5575256347656, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1604253053665161, + "rewards/margins": 10.83531665802002, + "rewards/rejected": -11.995742797851562, + "step": 4950 + }, + { + "epoch": 2.56, + "learning_rate": 8.137311149359343e-08, + "logits/chosen": -2.7012178897857666, + "logits/rejected": -2.6206467151641846, + "logps/chosen": -434.08843994140625, + "logps/rejected": -364.0971984863281, + "loss": 0.0085, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09630658477544785, + "rewards/margins": 11.247058868408203, + "rewards/rejected": -11.343365669250488, + "step": 4960 + }, + { + "epoch": 2.57, + "learning_rate": 8.041690571811053e-08, + "logits/chosen": -2.614105463027954, + "logits/rejected": -2.5202865600585938, + "logps/chosen": -219.88876342773438, + "logps/rejected": -269.26568603515625, + "loss": 0.0173, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.203838348388672, + "rewards/margins": 8.655978202819824, + "rewards/rejected": -10.859817504882812, + "step": 4970 + }, + { + "epoch": 2.57, + "learning_rate": 7.946069994262765e-08, + "logits/chosen": -2.546452045440674, + "logits/rejected": -2.6220192909240723, + "logps/chosen": -294.5769958496094, + "logps/rejected": -284.33343505859375, + "loss": 0.009, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0741980075836182, + "rewards/margins": 9.485953330993652, + "rewards/rejected": -10.560152053833008, + "step": 4980 + }, + { + "epoch": 2.58, + "learning_rate": 7.850449416714476e-08, + "logits/chosen": -2.7545557022094727, + "logits/rejected": -2.676429033279419, + "logps/chosen": -480.96600341796875, + "logps/rejected": -401.0008850097656, + "loss": 0.0081, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3216051459312439, + "rewards/margins": 9.37825870513916, + "rewards/rejected": -9.699864387512207, + "step": 4990 + }, + { + "epoch": 2.58, + "learning_rate": 7.754828839166188e-08, + "logits/chosen": -2.48799467086792, + "logits/rejected": -2.4741270542144775, + "logps/chosen": -251.6031036376953, + "logps/rejected": -321.9014587402344, + "loss": 0.0118, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3492779731750488, + "rewards/margins": 8.896702766418457, + "rewards/rejected": -10.245981216430664, + "step": 5000 + }, + { + "epoch": 2.58, + "eval_logits/chosen": -2.4194068908691406, + "eval_logits/rejected": -2.3763530254364014, + "eval_logps/chosen": -293.284423828125, + "eval_logps/rejected": -332.5270690917969, + "eval_loss": 0.7198395133018494, + "eval_rewards/accuracies": 0.828125, + "eval_rewards/chosen": -3.9049317836761475, + "eval_rewards/margins": 3.650782823562622, + "eval_rewards/rejected": -7.555714130401611, + "eval_runtime": 56.8998, + "eval_samples_per_second": 17.575, + "eval_steps_per_second": 0.281, + "step": 5000 + }, + { + "epoch": 2.59, + "learning_rate": 7.6592082616179e-08, + "logits/chosen": -2.4661271572113037, + "logits/rejected": -2.477613687515259, + "logps/chosen": -245.18594360351562, + "logps/rejected": -335.5259094238281, + "loss": 0.0138, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.542055368423462, + "rewards/margins": 10.06078052520752, + "rewards/rejected": -11.602836608886719, + "step": 5010 + }, + { + "epoch": 2.59, + "learning_rate": 7.563587684069611e-08, + "logits/chosen": -2.5083346366882324, + "logits/rejected": -2.643256187438965, + "logps/chosen": -207.7921600341797, + "logps/rejected": -385.1307678222656, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6641393899917603, + "rewards/margins": 10.65031909942627, + "rewards/rejected": -12.314460754394531, + "step": 5020 + }, + { + "epoch": 2.6, + "learning_rate": 7.467967106521324e-08, + "logits/chosen": -2.588287830352783, + "logits/rejected": -2.5413451194763184, + "logps/chosen": -273.2277526855469, + "logps/rejected": -238.3046875, + "loss": 0.0103, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6997830867767334, + "rewards/margins": 7.600827217102051, + "rewards/rejected": -8.300610542297363, + "step": 5030 + }, + { + "epoch": 2.6, + "learning_rate": 7.372346528973034e-08, + "logits/chosen": -2.5987842082977295, + "logits/rejected": -2.5648391246795654, + "logps/chosen": -217.76416015625, + "logps/rejected": -320.9278259277344, + "loss": 0.0087, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9506279230117798, + "rewards/margins": 10.49673080444336, + "rewards/rejected": -12.447359085083008, + "step": 5040 + }, + { + "epoch": 2.61, + "learning_rate": 7.276725951424746e-08, + "logits/chosen": -2.5334415435791016, + "logits/rejected": -2.48858642578125, + "logps/chosen": -171.40257263183594, + "logps/rejected": -390.48590087890625, + "loss": 0.0139, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9724035263061523, + "rewards/margins": 8.248844146728516, + "rewards/rejected": -10.2212495803833, + "step": 5050 + }, + { + "epoch": 2.61, + "learning_rate": 7.181105373876457e-08, + "logits/chosen": -2.483840227127075, + "logits/rejected": -2.437764883041382, + "logps/chosen": -204.07522583007812, + "logps/rejected": -299.15594482421875, + "loss": 0.0111, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3759775161743164, + "rewards/margins": 9.727631568908691, + "rewards/rejected": -11.103609085083008, + "step": 5060 + }, + { + "epoch": 2.62, + "learning_rate": 7.08548479632817e-08, + "logits/chosen": -2.751817226409912, + "logits/rejected": -2.6693196296691895, + "logps/chosen": -372.95458984375, + "logps/rejected": -369.3866271972656, + "loss": 0.0131, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.938228964805603, + "rewards/margins": 9.311058044433594, + "rewards/rejected": -10.249287605285645, + "step": 5070 + }, + { + "epoch": 2.62, + "learning_rate": 6.98986421877988e-08, + "logits/chosen": -2.541592836380005, + "logits/rejected": -2.455427646636963, + "logps/chosen": -295.2919006347656, + "logps/rejected": -412.5565490722656, + "loss": 0.0096, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2343528270721436, + "rewards/margins": 10.338408470153809, + "rewards/rejected": -11.572762489318848, + "step": 5080 + }, + { + "epoch": 2.63, + "learning_rate": 6.894243641231592e-08, + "logits/chosen": -2.578338623046875, + "logits/rejected": -2.542959690093994, + "logps/chosen": -227.2720947265625, + "logps/rejected": -329.40032958984375, + "loss": 0.0778, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6857062578201294, + "rewards/margins": 11.191483497619629, + "rewards/rejected": -11.877190589904785, + "step": 5090 + }, + { + "epoch": 2.63, + "learning_rate": 6.798623063683305e-08, + "logits/chosen": -2.287254810333252, + "logits/rejected": -2.432054281234741, + "logps/chosen": -312.0555114746094, + "logps/rejected": -452.99169921875, + "loss": 0.006, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9427648782730103, + "rewards/margins": 15.484460830688477, + "rewards/rejected": -16.42722511291504, + "step": 5100 + }, + { + "epoch": 2.63, + "eval_logits/chosen": -2.3859879970550537, + "eval_logits/rejected": -2.340737819671631, + "eval_logps/chosen": -296.35302734375, + "eval_logps/rejected": -336.11944580078125, + "eval_loss": 0.7505870461463928, + "eval_rewards/accuracies": 0.8125, + "eval_rewards/chosen": -4.211794376373291, + "eval_rewards/margins": 3.703155040740967, + "eval_rewards/rejected": -7.914949893951416, + "eval_runtime": 56.2566, + "eval_samples_per_second": 17.776, + "eval_steps_per_second": 0.284, + "step": 5100 + }, + { + "epoch": 2.64, + "learning_rate": 6.703002486135017e-08, + "logits/chosen": -2.3773114681243896, + "logits/rejected": -2.5287060737609863, + "logps/chosen": -236.22640991210938, + "logps/rejected": -360.97784423828125, + "loss": 0.0178, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9979360699653625, + "rewards/margins": 10.704690933227539, + "rewards/rejected": -11.702627182006836, + "step": 5110 + }, + { + "epoch": 2.64, + "learning_rate": 6.607381908586727e-08, + "logits/chosen": -2.642033338546753, + "logits/rejected": -2.6108345985412598, + "logps/chosen": -317.5076599121094, + "logps/rejected": -348.7528076171875, + "loss": 0.0096, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6177361011505127, + "rewards/margins": 9.611312866210938, + "rewards/rejected": -11.229048728942871, + "step": 5120 + }, + { + "epoch": 2.65, + "learning_rate": 6.511761331038438e-08, + "logits/chosen": -2.6155383586883545, + "logits/rejected": -2.6100358963012695, + "logps/chosen": -281.2548522949219, + "logps/rejected": -298.05865478515625, + "loss": 0.0147, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.588085651397705, + "rewards/margins": 7.020742893218994, + "rewards/rejected": -9.608829498291016, + "step": 5130 + }, + { + "epoch": 2.65, + "learning_rate": 6.416140753490151e-08, + "logits/chosen": -2.627002239227295, + "logits/rejected": -2.6328094005584717, + "logps/chosen": -421.49774169921875, + "logps/rejected": -432.20098876953125, + "loss": 0.0179, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.4822900295257568, + "rewards/margins": 8.786565780639648, + "rewards/rejected": -10.268855094909668, + "step": 5140 + }, + { + "epoch": 2.66, + "learning_rate": 6.320520175941863e-08, + "logits/chosen": -2.4586381912231445, + "logits/rejected": -2.452455997467041, + "logps/chosen": -246.73715209960938, + "logps/rejected": -327.2841491699219, + "loss": 0.0172, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.41286563873291, + "rewards/margins": 9.417495727539062, + "rewards/rejected": -11.830362319946289, + "step": 5150 + }, + { + "epoch": 2.66, + "learning_rate": 6.224899598393573e-08, + "logits/chosen": -2.557018756866455, + "logits/rejected": -2.4926464557647705, + "logps/chosen": -285.82635498046875, + "logps/rejected": -348.3973693847656, + "loss": 0.0111, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4971519708633423, + "rewards/margins": 9.434330940246582, + "rewards/rejected": -10.931482315063477, + "step": 5160 + }, + { + "epoch": 2.67, + "learning_rate": 6.129279020845286e-08, + "logits/chosen": -2.4606575965881348, + "logits/rejected": -2.5436136722564697, + "logps/chosen": -265.47454833984375, + "logps/rejected": -310.14862060546875, + "loss": 0.0113, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.3885912597179413, + "rewards/margins": 10.448331832885742, + "rewards/rejected": -10.836923599243164, + "step": 5170 + }, + { + "epoch": 2.67, + "learning_rate": 6.033658443296998e-08, + "logits/chosen": -2.5347704887390137, + "logits/rejected": -2.484384059906006, + "logps/chosen": -266.8102111816406, + "logps/rejected": -340.2280578613281, + "loss": 0.0178, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.881771445274353, + "rewards/margins": 11.114812850952148, + "rewards/rejected": -11.99658489227295, + "step": 5180 + }, + { + "epoch": 2.68, + "learning_rate": 5.9380378657487085e-08, + "logits/chosen": -2.5079243183135986, + "logits/rejected": -2.5110316276550293, + "logps/chosen": -455.3853454589844, + "logps/rejected": -375.2730407714844, + "loss": 0.0062, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6811244487762451, + "rewards/margins": 11.579205513000488, + "rewards/rejected": -12.26032829284668, + "step": 5190 + }, + { + "epoch": 2.68, + "learning_rate": 5.842417288200421e-08, + "logits/chosen": -2.665579080581665, + "logits/rejected": -2.835705280303955, + "logps/chosen": -388.7041320800781, + "logps/rejected": -376.1544494628906, + "loss": 0.0143, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7552551627159119, + "rewards/margins": 9.90630054473877, + "rewards/rejected": -10.6615571975708, + "step": 5200 + }, + { + "epoch": 2.68, + "eval_logits/chosen": -2.394641160964966, + "eval_logits/rejected": -2.350865125656128, + "eval_logps/chosen": -296.6682434082031, + "eval_logps/rejected": -336.7720642089844, + "eval_loss": 0.7407526969909668, + "eval_rewards/accuracies": 0.8125, + "eval_rewards/chosen": -4.243312835693359, + "eval_rewards/margins": 3.7369019985198975, + "eval_rewards/rejected": -7.980215549468994, + "eval_runtime": 55.9932, + "eval_samples_per_second": 17.859, + "eval_steps_per_second": 0.286, + "step": 5200 + }, + { + "epoch": 2.69, + "learning_rate": 5.7467967106521317e-08, + "logits/chosen": -2.518009901046753, + "logits/rejected": -2.5615527629852295, + "logps/chosen": -296.2488098144531, + "logps/rejected": -408.9478454589844, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9118080139160156, + "rewards/margins": 10.68850326538086, + "rewards/rejected": -12.600311279296875, + "step": 5210 + }, + { + "epoch": 2.69, + "learning_rate": 5.651176133103844e-08, + "logits/chosen": -2.740626096725464, + "logits/rejected": -2.676818370819092, + "logps/chosen": -306.84588623046875, + "logps/rejected": -342.53240966796875, + "loss": 0.0142, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.46114450693130493, + "rewards/margins": 10.382904052734375, + "rewards/rejected": -10.844049453735352, + "step": 5220 + }, + { + "epoch": 2.7, + "learning_rate": 5.555555555555555e-08, + "logits/chosen": -2.5409655570983887, + "logits/rejected": -2.4781863689422607, + "logps/chosen": -215.8829345703125, + "logps/rejected": -301.06756591796875, + "loss": 0.0098, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1217892169952393, + "rewards/margins": 8.91219711303711, + "rewards/rejected": -10.033987045288086, + "step": 5230 + }, + { + "epoch": 2.71, + "learning_rate": 5.459934978007267e-08, + "logits/chosen": -2.6471657752990723, + "logits/rejected": -2.611330509185791, + "logps/chosen": -273.4901123046875, + "logps/rejected": -403.7444763183594, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.45464539527893066, + "rewards/margins": 11.401620864868164, + "rewards/rejected": -11.856266021728516, + "step": 5240 + }, + { + "epoch": 2.71, + "learning_rate": 5.3643144004589786e-08, + "logits/chosen": -2.542269706726074, + "logits/rejected": -2.433465003967285, + "logps/chosen": -301.2662048339844, + "logps/rejected": -399.2783203125, + "loss": 0.0122, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4464475214481354, + "rewards/margins": 13.448400497436523, + "rewards/rejected": -13.001953125, + "step": 5250 + }, + { + "epoch": 2.72, + "learning_rate": 5.26869382291069e-08, + "logits/chosen": -2.6752572059631348, + "logits/rejected": -2.7158637046813965, + "logps/chosen": -226.4488983154297, + "logps/rejected": -335.8851623535156, + "loss": 0.0119, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.019395578652620316, + "rewards/margins": 10.634721755981445, + "rewards/rejected": -10.654115676879883, + "step": 5260 + }, + { + "epoch": 2.72, + "learning_rate": 5.173073245362402e-08, + "logits/chosen": -2.265803337097168, + "logits/rejected": -2.495293617248535, + "logps/chosen": -273.8394470214844, + "logps/rejected": -262.0378112792969, + "loss": 0.0138, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.907080888748169, + "rewards/margins": 9.230062484741211, + "rewards/rejected": -11.1371431350708, + "step": 5270 + }, + { + "epoch": 2.73, + "learning_rate": 5.077452667814113e-08, + "logits/chosen": -2.645397186279297, + "logits/rejected": -2.6353235244750977, + "logps/chosen": -234.93240356445312, + "logps/rejected": -313.4653015136719, + "loss": 0.0084, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.619875192642212, + "rewards/margins": 9.85165786743164, + "rewards/rejected": -11.47153377532959, + "step": 5280 + }, + { + "epoch": 2.73, + "learning_rate": 4.981832090265825e-08, + "logits/chosen": -2.5697460174560547, + "logits/rejected": -2.524587631225586, + "logps/chosen": -278.901123046875, + "logps/rejected": -400.72540283203125, + "loss": 0.0122, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3195740878582001, + "rewards/margins": 12.423995018005371, + "rewards/rejected": -12.104421615600586, + "step": 5290 + }, + { + "epoch": 2.74, + "learning_rate": 4.8862115127175364e-08, + "logits/chosen": -2.6613426208496094, + "logits/rejected": -2.5382397174835205, + "logps/chosen": -298.51617431640625, + "logps/rejected": -405.8147277832031, + "loss": 0.0057, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6868101358413696, + "rewards/margins": 10.438592910766602, + "rewards/rejected": -12.125402450561523, + "step": 5300 + }, + { + "epoch": 2.74, + "eval_logits/chosen": -2.384242057800293, + "eval_logits/rejected": -2.33884596824646, + "eval_logps/chosen": -297.62750244140625, + "eval_logps/rejected": -337.80126953125, + "eval_loss": 0.7552159428596497, + "eval_rewards/accuracies": 0.796875, + "eval_rewards/chosen": -4.339241981506348, + "eval_rewards/margins": 3.743894100189209, + "eval_rewards/rejected": -8.083136558532715, + "eval_runtime": 59.8742, + "eval_samples_per_second": 16.702, + "eval_steps_per_second": 0.267, + "step": 5300 + }, + { + "epoch": 2.74, + "learning_rate": 4.790590935169248e-08, + "logits/chosen": -2.6838698387145996, + "logits/rejected": -2.574967384338379, + "logps/chosen": -274.59368896484375, + "logps/rejected": -469.4027404785156, + "loss": 0.012, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4363610744476318, + "rewards/margins": 10.517416000366211, + "rewards/rejected": -11.953778266906738, + "step": 5310 + }, + { + "epoch": 2.75, + "learning_rate": 4.69497035762096e-08, + "logits/chosen": -2.537161350250244, + "logits/rejected": -2.4791531562805176, + "logps/chosen": -344.87347412109375, + "logps/rejected": -444.57366943359375, + "loss": 0.0116, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.297786235809326, + "rewards/margins": 12.291933059692383, + "rewards/rejected": -14.589719772338867, + "step": 5320 + }, + { + "epoch": 2.75, + "learning_rate": 4.599349780072671e-08, + "logits/chosen": -2.5215706825256348, + "logits/rejected": -2.408939838409424, + "logps/chosen": -344.95184326171875, + "logps/rejected": -272.84417724609375, + "loss": 0.0134, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2328124046325684, + "rewards/margins": 9.55348014831543, + "rewards/rejected": -11.78629207611084, + "step": 5330 + }, + { + "epoch": 2.76, + "learning_rate": 4.5037292025243834e-08, + "logits/chosen": -2.4096181392669678, + "logits/rejected": -2.3585500717163086, + "logps/chosen": -231.6038055419922, + "logps/rejected": -414.8946838378906, + "loss": 0.0113, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.199920892715454, + "rewards/margins": 9.544143676757812, + "rewards/rejected": -11.744064331054688, + "step": 5340 + }, + { + "epoch": 2.76, + "learning_rate": 4.408108624976094e-08, + "logits/chosen": -2.6306357383728027, + "logits/rejected": -2.373485565185547, + "logps/chosen": -273.1640625, + "logps/rejected": -362.6429138183594, + "loss": 0.0084, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7220970392227173, + "rewards/margins": 11.861469268798828, + "rewards/rejected": -12.583566665649414, + "step": 5350 + }, + { + "epoch": 2.77, + "learning_rate": 4.3124880474278065e-08, + "logits/chosen": -2.6821742057800293, + "logits/rejected": -2.5935044288635254, + "logps/chosen": -330.2795715332031, + "logps/rejected": -295.5904541015625, + "loss": 0.0196, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4781021177768707, + "rewards/margins": 9.99770450592041, + "rewards/rejected": -10.475805282592773, + "step": 5360 + }, + { + "epoch": 2.77, + "learning_rate": 4.2168674698795174e-08, + "logits/chosen": -2.668886184692383, + "logits/rejected": -2.7140769958496094, + "logps/chosen": -188.55136108398438, + "logps/rejected": -355.8598327636719, + "loss": 0.0061, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.314338207244873, + "rewards/margins": 8.853775978088379, + "rewards/rejected": -11.168115615844727, + "step": 5370 + }, + { + "epoch": 2.78, + "learning_rate": 4.1212468923312296e-08, + "logits/chosen": -2.5291595458984375, + "logits/rejected": -2.4308247566223145, + "logps/chosen": -271.3199768066406, + "logps/rejected": -382.0475158691406, + "loss": 0.0167, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.8199418783187866, + "rewards/margins": 9.165318489074707, + "rewards/rejected": -10.985260009765625, + "step": 5380 + }, + { + "epoch": 2.78, + "learning_rate": 4.025626314782941e-08, + "logits/chosen": -2.578953266143799, + "logits/rejected": -2.5158464908599854, + "logps/chosen": -277.73052978515625, + "logps/rejected": -347.6210632324219, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.26607850193977356, + "rewards/margins": 11.397039413452148, + "rewards/rejected": -11.663119316101074, + "step": 5390 + }, + { + "epoch": 2.79, + "learning_rate": 3.930005737234653e-08, + "logits/chosen": -2.543391704559326, + "logits/rejected": -2.6446430683135986, + "logps/chosen": -216.94741821289062, + "logps/rejected": -307.4268493652344, + "loss": 0.0138, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3559612035751343, + "rewards/margins": 9.72540283203125, + "rewards/rejected": -11.0813627243042, + "step": 5400 + }, + { + "epoch": 2.79, + "eval_logits/chosen": -2.3737339973449707, + "eval_logits/rejected": -2.3286330699920654, + "eval_logps/chosen": -296.6304016113281, + "eval_logps/rejected": -336.73223876953125, + "eval_loss": 0.7403773069381714, + "eval_rewards/accuracies": 0.8125, + "eval_rewards/chosen": -4.239532470703125, + "eval_rewards/margins": 3.7366957664489746, + "eval_rewards/rejected": -7.9762282371521, + "eval_runtime": 58.906, + "eval_samples_per_second": 16.976, + "eval_steps_per_second": 0.272, + "step": 5400 + }, + { + "epoch": 2.79, + "learning_rate": 3.8343851596863644e-08, + "logits/chosen": -2.68801212310791, + "logits/rejected": -2.5317561626434326, + "logps/chosen": -327.53106689453125, + "logps/rejected": -318.7012939453125, + "loss": 0.0065, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.34214359521865845, + "rewards/margins": 12.371678352355957, + "rewards/rejected": -12.713821411132812, + "step": 5410 + }, + { + "epoch": 2.8, + "learning_rate": 3.738764582138076e-08, + "logits/chosen": -2.63051700592041, + "logits/rejected": -2.5712480545043945, + "logps/chosen": -292.55035400390625, + "logps/rejected": -406.0823059082031, + "loss": 0.0096, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4555866718292236, + "rewards/margins": 9.967303276062012, + "rewards/rejected": -12.422890663146973, + "step": 5420 + }, + { + "epoch": 2.8, + "learning_rate": 3.6431440045897875e-08, + "logits/chosen": -2.558973550796509, + "logits/rejected": -2.5760269165039062, + "logps/chosen": -288.33062744140625, + "logps/rejected": -438.51007080078125, + "loss": 0.0067, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0792489051818848, + "rewards/margins": 10.678377151489258, + "rewards/rejected": -11.7576265335083, + "step": 5430 + }, + { + "epoch": 2.81, + "learning_rate": 3.547523427041499e-08, + "logits/chosen": -2.607342481613159, + "logits/rejected": -2.609557628631592, + "logps/chosen": -283.79608154296875, + "logps/rejected": -293.2716369628906, + "loss": 0.0106, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1565558910369873, + "rewards/margins": 10.7040433883667, + "rewards/rejected": -11.860601425170898, + "step": 5440 + }, + { + "epoch": 2.81, + "learning_rate": 3.4519028494932106e-08, + "logits/chosen": -2.703679084777832, + "logits/rejected": -2.5151591300964355, + "logps/chosen": -353.95758056640625, + "logps/rejected": -433.54766845703125, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7000893354415894, + "rewards/margins": 11.649955749511719, + "rewards/rejected": -12.350044250488281, + "step": 5450 + }, + { + "epoch": 2.82, + "learning_rate": 3.356282271944923e-08, + "logits/chosen": -2.603567600250244, + "logits/rejected": -2.502267360687256, + "logps/chosen": -226.2731170654297, + "logps/rejected": -341.97320556640625, + "loss": 0.0146, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.9284445643424988, + "rewards/margins": 11.389801979064941, + "rewards/rejected": -12.318245887756348, + "step": 5460 + }, + { + "epoch": 2.82, + "learning_rate": 3.260661694396634e-08, + "logits/chosen": -2.5252528190612793, + "logits/rejected": -2.5249342918395996, + "logps/chosen": -263.4516906738281, + "logps/rejected": -313.29998779296875, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3164348602294922, + "rewards/margins": 11.594769477844238, + "rewards/rejected": -11.911203384399414, + "step": 5470 + }, + { + "epoch": 2.83, + "learning_rate": 3.165041116848346e-08, + "logits/chosen": -2.660788059234619, + "logits/rejected": -2.5421648025512695, + "logps/chosen": -253.57839965820312, + "logps/rejected": -402.5025329589844, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6556440591812134, + "rewards/margins": 12.180809020996094, + "rewards/rejected": -13.836453437805176, + "step": 5480 + }, + { + "epoch": 2.83, + "learning_rate": 3.0694205393000576e-08, + "logits/chosen": -2.5150065422058105, + "logits/rejected": -2.4512484073638916, + "logps/chosen": -212.173828125, + "logps/rejected": -332.0416564941406, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4317614436149597, + "rewards/margins": 11.226727485656738, + "rewards/rejected": -10.794965744018555, + "step": 5490 + }, + { + "epoch": 2.84, + "learning_rate": 2.9737999617517688e-08, + "logits/chosen": -2.5827393531799316, + "logits/rejected": -2.5749595165252686, + "logps/chosen": -308.6050109863281, + "logps/rejected": -365.627197265625, + "loss": 0.0079, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4415368139743805, + "rewards/margins": 10.942670822143555, + "rewards/rejected": -11.384206771850586, + "step": 5500 + }, + { + "epoch": 2.84, + "eval_logits/chosen": -2.364140272140503, + "eval_logits/rejected": -2.319963216781616, + "eval_logps/chosen": -298.70074462890625, + "eval_logps/rejected": -339.1661682128906, + "eval_loss": 0.7524814605712891, + "eval_rewards/accuracies": 0.78125, + "eval_rewards/chosen": -4.44656229019165, + "eval_rewards/margins": 3.773061752319336, + "eval_rewards/rejected": -8.219624519348145, + "eval_runtime": 58.809, + "eval_samples_per_second": 17.004, + "eval_steps_per_second": 0.272, + "step": 5500 + }, + { + "epoch": 2.84, + "learning_rate": 2.8781793842034804e-08, + "logits/chosen": -2.426349401473999, + "logits/rejected": -2.384749174118042, + "logps/chosen": -259.9743347167969, + "logps/rejected": -300.3887634277344, + "loss": 0.015, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.3269202709198, + "rewards/margins": 8.843810081481934, + "rewards/rejected": -11.170731544494629, + "step": 5510 + }, + { + "epoch": 2.85, + "learning_rate": 2.782558806655192e-08, + "logits/chosen": -2.5341413021087646, + "logits/rejected": -2.5924274921417236, + "logps/chosen": -302.21563720703125, + "logps/rejected": -409.6150817871094, + "loss": 0.0134, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3284962177276611, + "rewards/margins": 9.481134414672852, + "rewards/rejected": -10.809629440307617, + "step": 5520 + }, + { + "epoch": 2.85, + "learning_rate": 2.6869382291069035e-08, + "logits/chosen": -2.4547677040100098, + "logits/rejected": -2.457869052886963, + "logps/chosen": -229.97561645507812, + "logps/rejected": -327.1774597167969, + "loss": 0.0107, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2366197109222412, + "rewards/margins": 10.809396743774414, + "rewards/rejected": -12.04601764678955, + "step": 5530 + }, + { + "epoch": 2.86, + "learning_rate": 2.591317651558615e-08, + "logits/chosen": -2.308411121368408, + "logits/rejected": -2.4219300746917725, + "logps/chosen": -283.2604675292969, + "logps/rejected": -351.8711242675781, + "loss": 0.004, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.108804225921631, + "rewards/margins": 10.290410995483398, + "rewards/rejected": -12.399213790893555, + "step": 5540 + }, + { + "epoch": 2.87, + "learning_rate": 2.4956970740103267e-08, + "logits/chosen": -2.5385169982910156, + "logits/rejected": -2.4845941066741943, + "logps/chosen": -319.16473388671875, + "logps/rejected": -403.1438293457031, + "loss": 0.0085, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6357721090316772, + "rewards/margins": 10.850339889526367, + "rewards/rejected": -12.486112594604492, + "step": 5550 + }, + { + "epoch": 2.87, + "learning_rate": 2.4000764964620386e-08, + "logits/chosen": -2.6393580436706543, + "logits/rejected": -2.731678009033203, + "logps/chosen": -312.54034423828125, + "logps/rejected": -417.93658447265625, + "loss": 0.0119, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7174618244171143, + "rewards/margins": 9.234588623046875, + "rewards/rejected": -10.952049255371094, + "step": 5560 + }, + { + "epoch": 2.88, + "learning_rate": 2.30445591891375e-08, + "logits/chosen": -2.5211081504821777, + "logits/rejected": -2.4729580879211426, + "logps/chosen": -307.0782775878906, + "logps/rejected": -450.36962890625, + "loss": 0.0088, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05847327783703804, + "rewards/margins": 13.286686897277832, + "rewards/rejected": -13.228212356567383, + "step": 5570 + }, + { + "epoch": 2.88, + "learning_rate": 2.2088353413654617e-08, + "logits/chosen": -2.313760757446289, + "logits/rejected": -2.362217664718628, + "logps/chosen": -294.3525390625, + "logps/rejected": -331.1678771972656, + "loss": 0.0149, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.203790307044983, + "rewards/margins": 14.620699882507324, + "rewards/rejected": -13.416910171508789, + "step": 5580 + }, + { + "epoch": 2.89, + "learning_rate": 2.1132147638171733e-08, + "logits/chosen": -2.0504655838012695, + "logits/rejected": -2.1224112510681152, + "logps/chosen": -262.3179626464844, + "logps/rejected": -329.19732666015625, + "loss": 0.0119, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7382757067680359, + "rewards/margins": 10.974761962890625, + "rewards/rejected": -11.713037490844727, + "step": 5590 + }, + { + "epoch": 2.89, + "learning_rate": 2.0175941862688848e-08, + "logits/chosen": -2.4296658039093018, + "logits/rejected": -2.384312391281128, + "logps/chosen": -263.25592041015625, + "logps/rejected": -385.36688232421875, + "loss": 0.0077, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.0216162204742432, + "rewards/margins": 11.878069877624512, + "rewards/rejected": -12.899686813354492, + "step": 5600 + }, + { + "epoch": 2.89, + "eval_logits/chosen": -2.351677894592285, + "eval_logits/rejected": -2.3077552318573, + "eval_logps/chosen": -299.8206481933594, + "eval_logps/rejected": -340.4544677734375, + "eval_loss": 0.7519664168357849, + "eval_rewards/accuracies": 0.796875, + "eval_rewards/chosen": -4.5585551261901855, + "eval_rewards/margins": 3.7899010181427, + "eval_rewards/rejected": -8.348456382751465, + "eval_runtime": 57.0149, + "eval_samples_per_second": 17.539, + "eval_steps_per_second": 0.281, + "step": 5600 + }, + { + "epoch": 2.9, + "learning_rate": 1.9219736087205964e-08, + "logits/chosen": -2.4466593265533447, + "logits/rejected": -2.5641415119171143, + "logps/chosen": -313.849609375, + "logps/rejected": -355.71954345703125, + "loss": 0.0183, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.058788836002349854, + "rewards/margins": 9.841516494750977, + "rewards/rejected": -9.782726287841797, + "step": 5610 + }, + { + "epoch": 2.9, + "learning_rate": 1.826353031172308e-08, + "logits/chosen": -2.6215555667877197, + "logits/rejected": -2.596318244934082, + "logps/chosen": -334.0271911621094, + "logps/rejected": -374.85211181640625, + "loss": 0.0108, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0268580913543701, + "rewards/margins": 10.430700302124023, + "rewards/rejected": -11.45755672454834, + "step": 5620 + }, + { + "epoch": 2.91, + "learning_rate": 1.73073245362402e-08, + "logits/chosen": -2.3654887676239014, + "logits/rejected": -2.3259222507476807, + "logps/chosen": -418.31524658203125, + "logps/rejected": -296.1111145019531, + "loss": 0.023, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1241583824157715, + "rewards/margins": 9.49864673614502, + "rewards/rejected": -11.622804641723633, + "step": 5630 + }, + { + "epoch": 2.91, + "learning_rate": 1.6351118760757314e-08, + "logits/chosen": -2.4572885036468506, + "logits/rejected": -2.4687421321868896, + "logps/chosen": -250.23764038085938, + "logps/rejected": -250.4460906982422, + "loss": 0.0111, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.8683540225028992, + "rewards/margins": 8.310081481933594, + "rewards/rejected": -9.178436279296875, + "step": 5640 + }, + { + "epoch": 2.92, + "learning_rate": 1.539491298527443e-08, + "logits/chosen": -2.6408703327178955, + "logits/rejected": -2.641308307647705, + "logps/chosen": -283.0168151855469, + "logps/rejected": -405.4056701660156, + "loss": 0.0036, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7451741099357605, + "rewards/margins": 11.91575813293457, + "rewards/rejected": -12.660932540893555, + "step": 5650 + }, + { + "epoch": 2.92, + "learning_rate": 1.4438707209791546e-08, + "logits/chosen": -2.531616687774658, + "logits/rejected": -2.6468756198883057, + "logps/chosen": -414.9684143066406, + "logps/rejected": -356.252685546875, + "loss": 0.0157, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.21153855323791504, + "rewards/margins": 10.415987014770508, + "rewards/rejected": -10.627525329589844, + "step": 5660 + }, + { + "epoch": 2.93, + "learning_rate": 1.3482501434308661e-08, + "logits/chosen": -2.300788402557373, + "logits/rejected": -2.308450937271118, + "logps/chosen": -356.91632080078125, + "logps/rejected": -350.33892822265625, + "loss": 0.015, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6116657257080078, + "rewards/margins": 11.264276504516602, + "rewards/rejected": -12.875943183898926, + "step": 5670 + }, + { + "epoch": 2.93, + "learning_rate": 1.2526295658825777e-08, + "logits/chosen": -2.603456497192383, + "logits/rejected": -2.6166439056396484, + "logps/chosen": -311.9185791015625, + "logps/rejected": -450.9242248535156, + "loss": 0.0147, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.2669516801834106, + "rewards/margins": 11.346095085144043, + "rewards/rejected": -12.613046646118164, + "step": 5680 + }, + { + "epoch": 2.94, + "learning_rate": 1.1570089883342895e-08, + "logits/chosen": -2.5220370292663574, + "logits/rejected": -2.4531850814819336, + "logps/chosen": -311.7686462402344, + "logps/rejected": -400.8462829589844, + "loss": 0.0135, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0794099569320679, + "rewards/margins": 12.467567443847656, + "rewards/rejected": -13.546978950500488, + "step": 5690 + }, + { + "epoch": 2.94, + "learning_rate": 1.061388410786001e-08, + "logits/chosen": -2.4289803504943848, + "logits/rejected": -2.4906742572784424, + "logps/chosen": -285.3009338378906, + "logps/rejected": -263.3441467285156, + "loss": 0.0094, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.07789945602417, + "rewards/margins": 7.724274635314941, + "rewards/rejected": -9.80217456817627, + "step": 5700 + }, + { + "epoch": 2.94, + "eval_logits/chosen": -2.3509910106658936, + "eval_logits/rejected": -2.3062477111816406, + "eval_logps/chosen": -299.77734375, + "eval_logps/rejected": -340.47900390625, + "eval_loss": 0.7527089715003967, + "eval_rewards/accuracies": 0.78125, + "eval_rewards/chosen": -4.5542216300964355, + "eval_rewards/margins": 3.7966880798339844, + "eval_rewards/rejected": -8.350910186767578, + "eval_runtime": 55.9629, + "eval_samples_per_second": 17.869, + "eval_steps_per_second": 0.286, + "step": 5700 + }, + { + "epoch": 2.95, + "learning_rate": 9.657678332377126e-09, + "logits/chosen": -2.4378364086151123, + "logits/rejected": -2.5011210441589355, + "logps/chosen": -278.77166748046875, + "logps/rejected": -327.8222351074219, + "loss": 0.0145, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.6567916870117188, + "rewards/margins": 9.313983917236328, + "rewards/rejected": -10.970773696899414, + "step": 5710 + }, + { + "epoch": 2.95, + "learning_rate": 8.701472556894243e-09, + "logits/chosen": -2.4347808361053467, + "logits/rejected": -2.4027464389801025, + "logps/chosen": -301.68988037109375, + "logps/rejected": -355.3216247558594, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6911423206329346, + "rewards/margins": 10.72395133972168, + "rewards/rejected": -13.415092468261719, + "step": 5720 + }, + { + "epoch": 2.96, + "learning_rate": 7.745266781411359e-09, + "logits/chosen": -2.4534902572631836, + "logits/rejected": -2.554394006729126, + "logps/chosen": -260.3663635253906, + "logps/rejected": -410.23223876953125, + "loss": 0.0164, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0108587741851807, + "rewards/margins": 9.392390251159668, + "rewards/rejected": -11.403249740600586, + "step": 5730 + }, + { + "epoch": 2.96, + "learning_rate": 6.7890610059284754e-09, + "logits/chosen": -2.566368579864502, + "logits/rejected": -2.585576057434082, + "logps/chosen": -261.0205993652344, + "logps/rejected": -348.109619140625, + "loss": 0.0074, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3029209077358246, + "rewards/margins": 10.11386775970459, + "rewards/rejected": -10.416789054870605, + "step": 5740 + }, + { + "epoch": 2.97, + "learning_rate": 5.832855230445592e-09, + "logits/chosen": -2.5495338439941406, + "logits/rejected": -2.4890074729919434, + "logps/chosen": -247.47286987304688, + "logps/rejected": -320.06011962890625, + "loss": 0.0072, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.9899286031723022, + "rewards/margins": 10.633166313171387, + "rewards/rejected": -11.62309455871582, + "step": 5750 + }, + { + "epoch": 2.97, + "learning_rate": 4.8766494549627085e-09, + "logits/chosen": -2.60798978805542, + "logits/rejected": -2.477149486541748, + "logps/chosen": -305.1927795410156, + "logps/rejected": -318.5039978027344, + "loss": 0.017, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8671627044677734, + "rewards/margins": 9.939409255981445, + "rewards/rejected": -11.806573867797852, + "step": 5760 + }, + { + "epoch": 2.98, + "learning_rate": 3.920443679479824e-09, + "logits/chosen": -2.545316219329834, + "logits/rejected": -2.5275652408599854, + "logps/chosen": -292.89263916015625, + "logps/rejected": -323.63385009765625, + "loss": 0.0161, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3659417629241943, + "rewards/margins": 9.62957763671875, + "rewards/rejected": -11.995519638061523, + "step": 5770 + }, + { + "epoch": 2.98, + "learning_rate": 2.96423790399694e-09, + "logits/chosen": -2.48178768157959, + "logits/rejected": -2.6639437675476074, + "logps/chosen": -219.31777954101562, + "logps/rejected": -324.5710754394531, + "loss": 0.008, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.672484040260315, + "rewards/margins": 8.533833503723145, + "rewards/rejected": -10.206315994262695, + "step": 5780 + }, + { + "epoch": 2.99, + "learning_rate": 2.008032128514056e-09, + "logits/chosen": -2.54166841506958, + "logits/rejected": -2.5910754203796387, + "logps/chosen": -343.8594665527344, + "logps/rejected": -451.935791015625, + "loss": 0.011, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7438589334487915, + "rewards/margins": 10.097026824951172, + "rewards/rejected": -11.840886116027832, + "step": 5790 + }, + { + "epoch": 2.99, + "learning_rate": 1.0518263530311723e-09, + "logits/chosen": -2.5881881713867188, + "logits/rejected": -2.5880398750305176, + "logps/chosen": -201.51014709472656, + "logps/rejected": -326.72650146484375, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4769667983055115, + "rewards/margins": 9.810731887817383, + "rewards/rejected": -10.287699699401855, + "step": 5800 + }, + { + "epoch": 2.99, + "eval_logits/chosen": -2.353024482727051, + "eval_logits/rejected": -2.308088779449463, + "eval_logps/chosen": -299.4037780761719, + "eval_logps/rejected": -340.0493469238281, + "eval_loss": 0.7519845962524414, + "eval_rewards/accuracies": 0.78125, + "eval_rewards/chosen": -4.5168681144714355, + "eval_rewards/margins": 3.791072130203247, + "eval_rewards/rejected": -8.307940483093262, + "eval_runtime": 55.3708, + "eval_samples_per_second": 18.06, + "eval_steps_per_second": 0.289, + "step": 5800 + }, + { + "epoch": 3.0, + "learning_rate": 9.562057754828839e-11, + "logits/chosen": -2.476783514022827, + "logits/rejected": -2.4620718955993652, + "logps/chosen": -259.29327392578125, + "logps/rejected": -435.9239196777344, + "loss": 0.015, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.16803565621376038, + "rewards/margins": 8.936319351196289, + "rewards/rejected": -8.768282890319824, + "step": 5810 + }, + { + "epoch": 3.0, + "step": 5811, + "total_flos": 0.0, + "train_loss": 0.2172969928600547, + "train_runtime": 23865.9828, + "train_samples_per_second": 7.789, + "train_steps_per_second": 0.243 + } + ], + "logging_steps": 10, + "max_steps": 5811, + "num_train_epochs": 3, + "save_steps": 500, + "total_flos": 0.0, + "trial_name": null, + "trial_params": null +}